third_party/opus/src/silk/x86/NSQ_del_dec_sse.c - Issue 2195313002: Remove Opus from DEPS and import a local copy

Side by Side Diff: third_party/opus/src/silk/x86/NSQ_del_dec_sse.c

Issue 2195313002: Remove Opus from DEPS and import a local copy (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Local copy of opus, opus/src/.gitignore, opus/DEPS, update README.chromium Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /* Copyright (c) 2014, Cisco Systems, INC

	2 Written by XiangMingZhu WeiZhou MinPeng YanWang

	3

	4 Redistribution and use in source and binary forms, with or without

	5 modification, are permitted provided that the following conditions

	6 are met:

	7

	8 - Redistributions of source code must retain the above copyright

	9 notice, this list of conditions and the following disclaimer.

	10

	11 - Redistributions in binary form must reproduce the above copyright

	12 notice, this list of conditions and the following disclaimer in the

	13 documentation and/or other materials provided with the distribution.

	14

	15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

	19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

	20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

	21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

	22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

	23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

	24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

	25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	26 */

	27

	28 #ifdef HAVE_CONFIG_H

	29 #include "config.h"

	30 #endif

	31

	32 #include <xmmintrin.h>

	33 #include <emmintrin.h>

	34 #include <smmintrin.h>

	35 #include "main.h"

	36 #include "celt/x86/x86cpu.h"

	37

	38 #include "stack_alloc.h"

	39

	40 typedef struct {

	41 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];

	42 opus_int32 RandState[ DECISION_DELAY ];

	43 opus_int32 Q_Q10[ DECISION_DELAY ];

	44 opus_int32 Xq_Q14[ DECISION_DELAY ];

	45 opus_int32 Pred_Q15[ DECISION_DELAY ];

	46 opus_int32 Shape_Q14[ DECISION_DELAY ];

	47 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];

	48 opus_int32 LF_AR_Q14;

	49 opus_int32 Seed;

	50 opus_int32 SeedInit;

	51 opus_int32 RD_Q10;

	52 } NSQ_del_dec_struct;

	53

	54 typedef struct {

	55 opus_int32 Q_Q10;

	56 opus_int32 RD_Q10;

	57 opus_int32 xq_Q14;

	58 opus_int32 LF_AR_Q14;

	59 opus_int32 sLTP_shp_Q14;

	60 opus_int32 LPC_exc_Q14;

	61 } NSQ_sample_struct;

	62

	63 typedef NSQ_sample_struct NSQ_sample_pair[ 2 ];

	64

	65 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(

	66 const silk_encoder_state *psEncC, /* I Encoder State */

	67 silk_nsq_state *NSQ, /* I/O NSQ state */

	68 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */

	69 const opus_int32 x_Q3[], /* I Input in Q3 */

	70 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */

	71 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */

	72 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */

	73 opus_int subfr, /* I Subframe number */

	74 opus_int nStatesDelayedDecision, /* I Number of del dec states */

	75 const opus_int LTP_scale_Q14, /* I LTP state scaling */

	76 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */

	77 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */

	78 const opus_int signal_type, /* I Signal type */

	79 const opus_int decisionDelay /* I Decision delay */

	80 );

	81

	82 /******************************************/

	83 /* Noise shape quantizer for one subframe */

	84 /******************************************/

	85 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(

	86 silk_nsq_state *NSQ, /* I/O NSQ state */

	87 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */

	88 opus_int signalType, /* I Signal type */

	89 const opus_int32 x_Q10[], /* I */

	90 opus_int8 pulses[], /* O */

	91 opus_int16 xq[], /* O */

	92 opus_int32 sLTP_Q15[], /* I/O LTP filter state */

	93 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */

	94 const opus_int16 a_Q12[], /* I Short term prediction coefs */

	95 const opus_int16 b_Q14[], /* I Long term prediction coefs */

	96 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */

	97 opus_int lag, /* I Pitch lag */

	98 opus_int32 HarmShapeFIRPacked_Q14, /* I */

	99 opus_int Tilt_Q14, /* I Spectral tilt */

	100 opus_int32 LF_shp_Q14, /* I */

	101 opus_int32 Gain_Q16, /* I */

	102 opus_int Lambda_Q10, /* I */

	103 opus_int offset_Q10, /* I */

	104 opus_int length, /* I Input length */

	105 opus_int subfr, /* I Subframe number */

	106 opus_int shapingLPCOrder, /* I Shaping LPC filter order */

	107 opus_int predictLPCOrder, /* I Prediction filter order */

	108 opus_int warping_Q16, /* I */

	109 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */

	110 opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */

	111 opus_int decisionDelay /* I */

	112 );

	113

	114 void silk_NSQ_del_dec_sse4_1(

	115 const silk_encoder_state *psEncC, /* I/O Encoder State */

	116 silk_nsq_state *NSQ, /* I/O NSQ state */

	117 SideInfoIndices *psIndices, /* I/O Quantization Indices */

	118 const opus_int32 x_Q3[], /* I Prefiltered input signal */

	119 opus_int8 pulses[], /* O Quantized pulse signal */

	120 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */

	121 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */

	122 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */

	123 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */

	124 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */

	125 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */

	126 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */

	127 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */

	128 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */

	129 const opus_int LTP_scale_Q14 /* I LTP state scaling */

	130 )

	131 {

	132 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;

	133 opus_int last_smple_idx, smpl_buf_idx, decisionDelay;

	134 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;

	135 opus_int16 *pxq;

	136 VARDECL( opus_int32, sLTP_Q15 );

	137 VARDECL( opus_int16, sLTP );

	138 opus_int32 HarmShapeFIRPacked_Q14;

	139 opus_int offset_Q10;

	140 opus_int32 RDmin_Q10, Gain_Q10;

	141 VARDECL( opus_int32, x_sc_Q10 );

	142 VARDECL( opus_int32, delayedGain_Q10 );

	143 VARDECL( NSQ_del_dec_struct, psDelDec );

	144 NSQ_del_dec_struct *psDD;

	145 SAVE_STACK;

	146

	147 /* Set unvoiced lag to the previous one, overwrite later for voiced */

	148 lag = NSQ->lagPrev;

	149

	150 silk_assert( NSQ->prev_gain_Q16 != 0 );

	151

	152 /* Initialize delayed decision states */

	153 ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );

	154 silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );

	155 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {

	156 psDD = &psDelDec[ k ];

	157 psDD->Seed = ( k + psIndices->Seed ) & 3;

	158 psDD->SeedInit = psDD->Seed;

	159 psDD->RD_Q10 = 0;

	160 psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;

	161 psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];

	162 silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

	163 silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );

	164 }

	165

	166 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

	167 smpl_buf_idx = 0; /* index of oldest samples */

	168

	169 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

	170

	171 /* For voiced frames limit the decision delay to lower than the pitch lag */

	172 if( psIndices->signalType == TYPE_VOICED ) {

	173 for( k = 0; k < psEncC->nb_subfr; k++ ) {

	174 decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );

	175 }

	176 } else {

	177 if( lag > 0 ) {

	178 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );

	179 }

	180 }

	181

	182 if( psIndices->NLSFInterpCoef_Q2 == 4 ) {

	183 LSF_interpolation_flag = 0;

	184 } else {

	185 LSF_interpolation_flag = 1;

	186 }

	187

	188 ALLOC( sLTP_Q15,

	189 psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );

	190 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );

	191 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );

	192 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );

	193 /* Set up pointers to start of sub frame */

	194 pxq = &NSQ->xq[ psEncC->ltp_mem_length ];

	195 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;

	196 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;

	197 subfr = 0;

	198 for( k = 0; k < psEncC->nb_subfr; k++ ) {

	199 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];

	200 B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];

	201 AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];

	202

	203 /* Noise shape parameters */

	204 silk_assert( HarmShapeGain_Q14[ k ] >= 0 );

	205 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );

	206 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

	207

	208 NSQ->rewhite_flag = 0;

	209 if( psIndices->signalType == TYPE_VOICED ) {

	210 /* Voiced */

	211 lag = pitchL[ k ];

	212

	213 /* Re-whitening */

	214 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {

	215 if( k == 2 ) {

	216 /* RESET DELAYED DECISIONS */

	217 /* Find winner */

	218 RDmin_Q10 = psDelDec[ 0 ].RD_Q10;

	219 Winner_ind = 0;

	220 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {

	221 if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {

	222 RDmin_Q10 = psDelDec[ i ].RD_Q10;

	223 Winner_ind = i;

	224 }

	225 }

	226 for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {

	227 if( i != Winner_ind ) {

	228 psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );

	229 silk_assert( psDelDec[ i ].RD_Q10 >= 0 );

	230 }

	231 }

	232

	233 /* Copy final part of signals from winner state to output and long-term filter states */

	234 psDD = &psDelDec[ Winner_ind ];

	235 last_smple_idx = smpl_buf_idx + decisionDelay;

	236 for( i = 0; i < decisionDelay; i++ ) {

	237 last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;

	238 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );

	239 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(

	240 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );

	241 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];

	242 }

	243

	244 subfr = 0;

	245 }

	246

	247 /* Rewhiten with new A coefs */

	248 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;

	249 silk_assert( start_idx > 0 );

	250

	251 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],

	252 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

	253

	254 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;

	255 NSQ->rewhite_flag = 1;

	256 }

	257 }

	258

	259 silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,

	260 psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

	261

	262 silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,

	263 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],

	264 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,

	265 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

	266

	267 x_Q3 += psEncC->subfr_length;

	268 pulses += psEncC->subfr_length;

	269 pxq += psEncC->subfr_length;

	270 }

	271

	272 /* Find winner */

	273 RDmin_Q10 = psDelDec[ 0 ].RD_Q10;

	274 Winner_ind = 0;

	275 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {

	276 if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {

	277 RDmin_Q10 = psDelDec[ k ].RD_Q10;

	278 Winner_ind = k;

	279 }

	280 }

	281

	282 /* Copy final part of signals from winner state to output and long-term filter states */

	283 psDD = &psDelDec[ Winner_ind ];

	284 psIndices->Seed = psDD->SeedInit;

	285 last_smple_idx = smpl_buf_idx + decisionDelay;

	286 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );

	287 for( i = 0; i < decisionDelay; i++ ) {

	288 last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;

	289 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );

	290 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(

	291 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );

	292 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];

	293 }

	294 silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

	295 silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );

	296

	297 /* Update states */

	298 NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;

	299 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

	300

	301 /* Save quantized speech signal */

	302 /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->frame_length * sizeof( opus_int16 ) ) */

	303 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );

	304 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );

	305 RESTORE_STACK;

	306 }

	307

	308 /******************************************/

	309 /* Noise shape quantizer for one subframe */

	310 /******************************************/

	311 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(

	312 silk_nsq_state *NSQ, /* I/O NSQ state */

	313 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */

	314 opus_int signalType, /* I Signal type */

	315 const opus_int32 x_Q10[], /* I */

	316 opus_int8 pulses[], /* O */

	317 opus_int16 xq[], /* O */

	318 opus_int32 sLTP_Q15[], /* I/O LTP filter state */

	319 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */

	320 const opus_int16 a_Q12[], /* I Short term prediction coefs */

	321 const opus_int16 b_Q14[], /* I Long term prediction coefs */

	322 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */

	323 opus_int lag, /* I Pitch lag */

	324 opus_int32 HarmShapeFIRPacked_Q14, /* I */

	325 opus_int Tilt_Q14, /* I Spectral tilt */

	326 opus_int32 LF_shp_Q14, /* I */

	327 opus_int32 Gain_Q16, /* I */

	328 opus_int Lambda_Q10, /* I */

	329 opus_int offset_Q10, /* I */

	330 opus_int length, /* I Input length */

	331 opus_int subfr, /* I Subframe number */

	332 opus_int shapingLPCOrder, /* I Shaping LPC filter order */

	333 opus_int predictLPCOrder, /* I Prediction filter order */

	334 opus_int warping_Q16, /* I */

	335 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */

	336 opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */

	337 opus_int decisionDelay /* I */

	338 )

	339 {

	340 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;

	341 opus_int32 Winner_rand_state;

	342 opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;

	343 opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;

	344 opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;

	345 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;

	346 opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;

	347 VARDECL( NSQ_sample_pair, psSampleState );

	348 NSQ_del_dec_struct *psDD;

	349 NSQ_sample_struct *psSS;

	350

	351 __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;

	352 __m128i b_Q12_0123, b_sr_Q12_0123;

	353 SAVE_STACK;

	354

	355 silk_assert( nStatesDelayedDecision > 0 );

	356 ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );

	357

	358 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];

	359 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];

	360 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );

	361

	362 a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );

	363 a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );

	364

	365 if( opus_likely( predictLPCOrder == 16 ) ) {

	366 a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );

	367 a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );

	368 }

	369

	370 if( signalType == TYPE_VOICED ){

	371 b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );

	372 b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	373 }

	374 for( i = 0; i < length; i++ ) {

	375 /* Perform common calculations used in all states */

	376

	377 /* Long-term prediction */

	378 if( signalType == TYPE_VOICED ) {

	379 /* Unrolled loop */

	380 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */

	381 LTP_pred_Q14 = 2;

	382 {

	383 __m128i tmpa, tmpb, pred_lag_ptr_tmp;

	384 pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );

	385 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );

	386 tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );

	387 tmpa = _mm_srli_si128( tmpa, 2 );

	388

	389 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */

	390 pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );

	391 pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 );

	392 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );

	393

	394 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */

	395 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );

	396 LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp );

	397

	398 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );

	399 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */

	400 pred_lag_ptr++;

	401 }

	402 } else {

	403 LTP_pred_Q14 = 0;

	404 }

	405

	406 /* Long-term shaping */

	407 if( lag > 0 ) {

	408 /* Symmetric, packed FIR coefficients */

	409 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );

	410 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );

	411 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */

	412 shp_lag_ptr++;

	413 } else {

	414 n_LTP_Q14 = 0;

	415 }

	416 {

	417 __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;

	418

	419 for( k = 0; k < nStatesDelayedDecision; k++ ) {

	420 /* Delayed decision state */

	421 psDD = &psDelDec[ k ];

	422

	423 /* Sample state */

	424 psSS = psSampleState[ k ];

	425

	426 /* Generate dither */

	427 psDD->Seed = silk_RAND( psDD->Seed );

	428

	429 /* Pointer used in short term prediction and shaping */

	430 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];

	431 /* Short-term prediction */

	432 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );

	433 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */

	434 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );

	435

	436 tmpb = _mm_setzero_si128();

	437

	438 /* step 1 */

	439 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */

	440 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */

	441 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */

	442

	443 tmpa = _mm_srli_epi64( tmpa, 16 );

	444 tmpb = _mm_add_epi32( tmpb, tmpa );

	445

	446 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	447 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	448 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */

	449 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );

	450 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

	451

	452 /* step 2 */

	453 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );

	454 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );

	455 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );

	456 tmpa = _mm_srli_epi64( tmpa, 16 );

	457 tmpb = _mm_add_epi32( tmpb, tmpa );

	458

	459 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	460 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	461 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );

	462 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );

	463 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

	464

	465 if ( opus_likely( predictLPCOrder == 16 ) )

	466 {

	467 /* step 3 */

	468 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );

	469 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );

	470 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );

	471 tmpa = _mm_srli_epi64( tmpa, 16 );

	472 tmpb = _mm_add_epi32( tmpb, tmpa );

	473

	474 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	475 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */

	476 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );

	477 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );

	478 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

	479

	480 /* step 4 */

	481 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );

	482 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );

	483 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );

	484 tmpa = _mm_srli_epi64( tmpa, 16 );

	485 tmpb = _mm_add_epi32( tmpb, tmpa );

	486

	487 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	488 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */

	489 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );

	490 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );

	491 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

	492

	493 /* add at last */

	494 /* equal shift right 8 bytes*/

	495 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );

	496 tmpb = _mm_add_epi32( tmpb, tmpa );

	497 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );

	498 }

	499 else

	500 {

	501 /* add at last */

	502 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/

	503 tmpb = _mm_add_epi32( tmpb, tmpa );

	504 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );

	505

	506 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );

	507 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );

	508 }

	509

	510 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */

	511

	512 /* Noise shape feedback */

	513 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */

	514 /* Output of lowpass section */

	515 tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );

	516 /* Output of allpass section */

	517 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );

	518 psDD->sAR2_Q14[ 0 ] = tmp2;

	519 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );

	520 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );

	521 /* Loop over allpass sections */

	522 for( j = 2; j < shapingLPCOrder; j += 2 ) {

	523 /* Output of allpass section */

	524 tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );

	525 psDD->sAR2_Q14[ j - 1 ] = tmp1;

	526 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );

	527 /* Output of allpass section */

	528 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );

	529 psDD->sAR2_Q14[ j + 0 ] = tmp2;

	530 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );

	531 }

	532 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;

	533 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );

	534

	535 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */

	536 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */

	537 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */

	538

	539 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */

	540 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */

	541 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */

	542

	543 /* Input minus prediction plus noise feedback */

	544 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */

	545 tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */

	546 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */

	547 tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */

	548 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */

	549

	550 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */

	551

	552 /* Flip sign depending on dither */

	553 if ( psDD->Seed < 0 ) {

	554 r_Q10 = -r_Q10;

	555 }

	556 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );

	557

	558 /* Find two quantization level candidates and measure their rate-distortion */

	559 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );

	560 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );

	561 if( q1_Q0 > 0 ) {

	562 q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );

	563 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );

	564 q2_Q10 = silk_ADD32( q1_Q10, 1024 );

	565 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );

	566 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );

	567 } else if( q1_Q0 == 0 ) {

	568 q1_Q10 = offset_Q10;

	569 q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );

	570 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );

	571 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );

	572 } else if( q1_Q0 == -1 ) {

	573 q2_Q10 = offset_Q10;

	574 q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );

	575 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );

	576 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );

	577 } else { /* q1_Q0 < -1 */

	578 q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );

	579 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );

	580 q2_Q10 = silk_ADD32( q1_Q10, 1024 );

	581 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );

	582 rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );

	583 }

	584 rr_Q10 = silk_SUB32( r_Q10, q1_Q10 );

	585 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );

	586 rr_Q10 = silk_SUB32( r_Q10, q2_Q10 );

	587 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );

	588

	589 if( rd1_Q10 < rd2_Q10 ) {

	590 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );

	591 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );

	592 psSS[ 0 ].Q_Q10 = q1_Q10;

	593 psSS[ 1 ].Q_Q10 = q2_Q10;

	594 } else {

	595 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );

	596 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );

	597 psSS[ 0 ].Q_Q10 = q2_Q10;

	598 psSS[ 1 ].Q_Q10 = q1_Q10;

	599 }

	600

	601 /* Update states for best quantization */

	602

	603 /* Quantized excitation */

	604 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );

	605 if ( psDD->Seed < 0 ) {

	606 exc_Q14 = -exc_Q14;

	607 }

	608

	609 /* Add predictions */

	610 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );

	611 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );

	612

	613 /* Update states */

	614 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );

	615 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );

	616 psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;

	617 psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;

	618 psSS[ 0 ].xq_Q14 = xq_Q14;

	619

	620 /* Update states for second best quantization */

	621

	622 /* Quantized excitation */

	623 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );

	624 if ( psDD->Seed < 0 ) {

	625 exc_Q14 = -exc_Q14;

	626 }

	627

	628

	629 /* Add predictions */

	630 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );

	631 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );

	632

	633 /* Update states */

	634 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );

	635 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );

	636 psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;

	637 psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;

	638 psSS[ 1 ].xq_Q14 = xq_Q14;

	639 }

	640 }

	641 *smpl_buf_idx = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK; /* Index to newest samples */

	642 last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK; /* Index to decisionDelay old samples */

	643

	644 /* Find winner */

	645 RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;

	646 Winner_ind = 0;

	647 for( k = 1; k < nStatesDelayedDecision; k++ ) {

	648 if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {

	649 RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10;

	650 Winner_ind = k;

	651 }

	652 }

	653

	654 /* Increase RD values of expired states */

	655 Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];

	656 for( k = 0; k < nStatesDelayedDecision; k++ ) {

	657 if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {

	658 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );

	659 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );

	660 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );

	661 }

	662 }

	663

	664 /* Find worst in first set and best in second set */

	665 RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;

	666 RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10;

	667 RDmax_ind = 0;

	668 RDmin_ind = 0;

	669 for( k = 1; k < nStatesDelayedDecision; k++ ) {

	670 /* find worst in first set */

	671 if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {

	672 RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10;

	673 RDmax_ind = k;

	674 }

	675 /* find best in second set */

	676 if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {

	677 RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10;

	678 RDmin_ind = k;

	679 }

	680 }

	681

	682 /* Replace a state if best from second set outperforms worst in first set */

	683 if( RDmin_Q10 < RDmax_Q10 ) {

	684 silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,

	685 ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );

	686 silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );

	687 }

	688

	689 /* Write samples from winner to output and long-term filter states */

	690 psDD = &psDelDec[ Winner_ind ];

	691 if( subfr > 0 \|\| i >= decisionDelay ) {

	692 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );

	693 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(

	694 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );

	695 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];

	696 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ];

	697 }

	698 NSQ->sLTP_shp_buf_idx++;

	699 NSQ->sLTP_buf_idx++;

	700

	701 /* Update states */

	702 for( k = 0; k < nStatesDelayedDecision; k++ ) {

	703 psDD = &psDelDec[ k ];

	704 psSS = &psSampleState[ k ][ 0 ];

	705 psDD->LF_AR_Q14 = psSS->LF_AR_Q14;

	706 psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;

	707 psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;

	708 psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;

	709 psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_ exc_Q14, 1 );

	710 psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14;

	711 psDD->Seed = silk_ADD32_ovflw( psDD->S eed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );

	712 psDD->RandState[ *smpl_buf_idx ] = psDD->Seed;

	713 psDD->RD_Q10 = psSS->RD_Q10;

	714 }

	715 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;

	716 }

	717 /* Update LPC states */

	718 for( k = 0; k < nStatesDelayedDecision; k++ ) {

	719 psDD = &psDelDec[ k ];

	720 silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENG TH * sizeof( opus_int32 ) );

	721 }

	722 RESTORE_STACK;

	723 }

	724

	725 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(

	726 const silk_encoder_state psEncC, / I Encoder State */

	727 silk_nsq_state NSQ, / I/O NSQ state */

	728 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision sta tes */

	729 const opus_int32 x_Q3[], /* I Input in Q3 */

	730 opus_int32 x_sc_Q10[], /* O Input scaled with 1/ Gain in Q10 */

	731 const opus_int16 sLTP[], /* I Re-whitened LTP stat e in Q0 */

	732 opus_int32 sLTP_Q15[], /* O LTP state matching s caled input */

	733 opus_int subfr, /* I Subframe number */

	734 opus_int nStatesDelayedDecision, /* I Number of del dec st ates */

	735 const opus_int LTP_scale_Q14, /* I LTP state scaling */

	736 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */

	737 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */

	738 const opus_int signal_type, /* I Signal type */

	739 const opus_int decisionDelay /* I Decision delay */

	740 )

	741 {

	742 opus_int i, k, lag;

	743 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;

	744 NSQ_del_dec_struct *psDD;

	745 __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

	746

	747 lag = pitchL[ subfr ];

	748 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );

	749

	750 silk_assert( inv_gain_Q31 != 0 );

	751

	752 /* Calculate gain adjustment factor */

	753 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {

	754 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

	755 } else {

	756 gain_adj_Q16 = (opus_int32)1 << 16;

	757 }

	758

	759 /* Scale input */

	760 inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

	761

	762 /* prepare inv_gain_Q23 in packed 4 32-bits */

	763 xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

	764

	765 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {

	766 xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );

	767 /* equal shift right 4 bytes*/

	768 xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

	769

	770 xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );

	771 xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

	772

	773 xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );

	774 xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

	775

	776 xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

	777

	778 _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );

	779 }

	780

	781 for( ; i < psEncC->subfr_length; i++ ) {

	782 x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );

	783 }

	784

	785 /* Save inverse gain */

	786 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

	787

	788 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */

	789 if( NSQ->rewhite_flag ) {

	790 if( subfr == 0 ) {

	791 /* Do LTP downscaling */

	792 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );

	793 }

	794 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {

	795 silk_assert( i < MAX_FRAME_LENGTH );

	796 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );

	797 }

	798 }

	799

	800 /* Adjust for changing gain */

	801 if( gain_adj_Q16 != (opus_int32)1 << 16 ) {

	802 /* Scale long-term shaping state */

	803 {

	804 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3 x1;

	805

	806 /* prepare gain_adj_Q16 in packed 4 32-bits */

	807 xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );

	808

	809 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sL TP_shp_buf_idx - 3; i += 4 )

	810 {

	811 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP _shp_Q14[ i ] ) ) );

	812 /* equal shift right 4 bytes*/

	813 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0 , _MM_SHUFFLE( 0, 3, 2, 1 ) );

	814

	815 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xm m_gain_adj_Q16 );

	816 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xm m_gain_adj_Q16 );

	817

	818 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 1 6 );

	819 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 1 6 );

	820

	821 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

	822

	823 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_ sLTP_shp_Q14_x2x0 );

	824 }

	825

	826 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {

	827 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_sh p_Q14[ i ] );

	828 }

	829

	830 /* Scale long-term prediction state */

	831 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {

	832 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_ buf_idx - decisionDelay; i++ ) {

	833 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );

	834 }

	835 }

	836

	837 for( k = 0; k < nStatesDelayedDecision; k++ ) {

	838 psDD = &psDelDec[ k ];

	839

	840 /* Scale scalar states */

	841 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );

	842

	843 /* Scale short-term prediction and shaping states */

	844 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {

	845 psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_ Q14[ i ] );

	846 }

	847 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {

	848 psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_ Q14[ i ] );

	849 }

	850 for( i = 0; i < DECISION_DELAY; i++ ) {

	851 psDD->Pred_Q15[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred _Q15[ i ] );

	852 psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shap e_Q14[ i ] );

	853 }

	854 }

	855 }

	856 }

	857 }

OLD	NEW

« no previous file with comments | « third_party/opus/src/silk/typedef.h ('k') | third_party/opus/src/silk/x86/NSQ_sse.c » ('j') | no next file with comments »