Chromium Code Reviews

Side by Side Diff: third_party/opus/src/silk/arm/NSQ_del_dec_neon_intr.c

Issue 2962373002: [Opus] Update to v1.2.1 (Closed)
Patch Set: Pre-increment instead of post-increment Created 3 years, 5 months ago
1 /***********************************************************************
2 Copyright (c) 2017 Google Inc.
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include <arm_neon.h>
33 #ifdef OPUS_CHECK_ASM
34 # include <string.h>
35 #endif
36 #include "main.h"
37 #include "stack_alloc.h"
38
39 /* The NEON intrinsics optimization can currently parallelize at most 4 delay decision states. */
40 /* If there are more states, the C function is called, and this optimization must be expanded. */
41 #define NEON_MAX_DEL_DEC_STATES 4
42
43 typedef struct {
44 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ];
45 opus_int32 RandState[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
46 opus_int32 Q_Q10[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
47 opus_int32 Xq_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
48 opus_int32 Pred_Q15[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
49 opus_int32 Shape_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
50 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ];
51 opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
52 opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
53 opus_int32 Seed[ NEON_MAX_DEL_DEC_STATES ];
54 opus_int32 SeedInit[ NEON_MAX_DEL_DEC_STATES ];
55 opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
56 } NSQ_del_decs_struct;
57
58 typedef struct {
59 opus_int32 Q_Q10[ NEON_MAX_DEL_DEC_STATES ];
60 opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
61 opus_int32 xq_Q14[ NEON_MAX_DEL_DEC_STATES ];
62 opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
63 opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
64 opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ];
65 opus_int32 LPC_exc_Q14[ NEON_MAX_DEL_DEC_STATES ];
66 } NSQ_samples_struct;
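
/* Note (editorial sketch, not part of the patch): both structs use a struct-of-arrays
   layout, indexed [ sample ][ state ], so the same field of all four delayed decision
   states is contiguous in memory and one 128-bit NEON load/store covers every state at
   once, e.g. (illustrative; delta_s32x4 is a hypothetical update value):

       int32x4_t v = vld1q_s32( psDelDec->LF_AR_Q14 ); // lanes 0..3 = states 0..3
       v = vaddq_s32( v, delta_s32x4 );                // update all 4 states at once
       vst1q_s32( psDelDec->LF_AR_Q14, v );
*/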
67
68 static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
69 const silk_encoder_state *psEncC, /* I Encoder State */
70 silk_nsq_state *NSQ, /* I/O NSQ state */
71 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
72 const opus_int16 x16[], /* I Input */
73 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
74 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
75 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
76 opus_int subfr, /* I Subframe number */
77 const opus_int LTP_scale_Q14, /* I LTP state scaling */
78 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
79 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
80 const opus_int signal_type, /* I Signal type */
81 const opus_int decisionDelay /* I Decision delay */
82 );
83
84 /******************************************/
85 /* Noise shape quantizer for one subframe */
86 /******************************************/
87 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
88 silk_nsq_state *NSQ, /* I/O NSQ state */
89 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
90 opus_int signalType, /* I Signal type */
91 const opus_int32 x_Q10[], /* I */
92 opus_int8 pulses[], /* O */
93 opus_int16 xq[], /* O */
94 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
95 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
96 const opus_int16 a_Q12[], /* I Short term prediction coefs */
97 const opus_int16 b_Q14[], /* I Long term prediction coefs */
98 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
99 opus_int lag, /* I Pitch lag */
100 opus_int32 HarmShapeFIRPacked_Q14, /* I */
101 opus_int Tilt_Q14, /* I Spectral tilt */
102 opus_int32 LF_shp_Q14, /* I */
103 opus_int32 Gain_Q16, /* I */
104 opus_int Lambda_Q10, /* I */
105 opus_int offset_Q10, /* I */
106 opus_int length, /* I Input length */
107 opus_int subfr, /* I Subframe number */
108 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
109 opus_int predictLPCOrder, /* I Prediction filter order */
110 opus_int warping_Q16, /* I */
111 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
112 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
113 opus_int decisionDelay /* I */
114 );
115
116 static OPUS_INLINE void copy_winner_state_kernel(
117 const NSQ_del_decs_struct *psDelDec,
118 const opus_int offset,
119 const opus_int last_smple_idx,
120 const opus_int Winner_ind,
121 const int32x2_t gain_lo_s32x2,
122 const int32x2_t gain_hi_s32x2,
123 const int32x4_t shift_s32x4,
124 int32x4_t t0_s32x4,
125 int32x4_t t1_s32x4,
126 opus_int8 *const pulses,
127 opus_int16 *pxq,
128 silk_nsq_state *NSQ
129 )
130 {
131 int16x8_t t_s16x8;
132 int32x4_t o0_s32x4, o1_s32x4;
133
134 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
135 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
136 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
137 t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
138 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
139 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
140 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
141 t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
142 t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
143 vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );
144
145 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
146 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
147 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
148 t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
149 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
150 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
151 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
152 t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
153 o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
154 o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
155 o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
156 o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
157 o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
158 o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
159 vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
160 vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );
161
162 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
163 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
164 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
165 t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
166 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
167 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
168 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
169 t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
170 vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
171 vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
172 }
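
/* Note (editorial, not part of the patch): per lane this kernel mirrors the scalar
   winner copy, roughly

       pulses[ n ] = (opus_int8)silk_RSHIFT_ROUND( Q_Q10, 10 );
       pxq[ n ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( Xq_Q14, gain ), shift ) );

   where silk_SMULWW() is realized by the gain_lo/gain_hi split (vqdmulhq_lane_s32() for
   ( x * gain_lo ) >> 16, vmlaq_lane_s32() for x * gain_hi), the rounding right shift by
   vrshlq_s32() with a negative shift count, and the saturating narrow by vqmovn_s32(). */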
173
174 static OPUS_INLINE void copy_winner_state(
175 const NSQ_del_decs_struct *psDelDec,
176 const opus_int decisionDelay,
177 const opus_int smpl_buf_idx,
178 const opus_int Winner_ind,
179 const opus_int32 gain,
180 const opus_int32 shift,
181 opus_int8 *const pulses,
182 opus_int16 *pxq,
183 silk_nsq_state *NSQ
184 )
185 {
186 opus_int i, last_smple_idx;
187 const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
188 const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
189 const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
190 int32x4_t t0_s32x4, t1_s32x4;
191
192 t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
193 last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
194 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
195 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
196
197 for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
198 copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
199 }
200 for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
201 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
202 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
203 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
204 }
205
206 last_smple_idx += DECISION_DELAY;
207 for( ; i < ( decisionDelay - 7 ); i += 8, last_smple_idx -= 8 ) {
208 copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
209 }
210 for( ; i < decisionDelay; i++, last_smple_idx-- ) {
211 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
212 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
213 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
214 }
215 }
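
/* Note (editorial, not part of the patch): last_smple_idx walks the DECISION_DELAY-sized
   circular history backwards. Its starting value smpl_buf_idx + decisionDelay - 1 +
   DECISION_DELAY is at most 3 * DECISION_DELAY - 2, so the two conditional subtractions
   above suffice to bring it into [ 0, DECISION_DELAY ). The first pair of loops drains
   samples until the index would go negative; it is then rewound by DECISION_DELAY and the
   remaining samples are copied across the wrap by the second pair of loops. */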
216
217 void silk_NSQ_del_dec_neon(
218 const silk_encoder_state *psEncC, /* I Encoder State */
219 silk_nsq_state *NSQ, /* I/O NSQ state */
220 SideInfoIndices *psIndices, /* I/O Quantization Indices */
221 const opus_int16 x16[], /* I Input */
222 opus_int8 pulses[], /* O Quantized pulse signal */
223 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
224 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
225 const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
226 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
227 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
228 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
229 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
230 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
231 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
232 const opus_int LTP_scale_Q14 /* I LTP state scaling */
233 )
234 {
235 #ifdef OPUS_CHECK_ASM
236 silk_nsq_state NSQ_c;
237 SideInfoIndices psIndices_c;
238 opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
239 const opus_int8 *const pulses_a = pulses;
240
241 ( void )pulses_a;
242 silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
243 silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
244 silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
245 silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
246 pitchL, Lambda_Q10, LTP_scale_Q14 );
247 #endif
248
249 /* The optimization parallelizes the different delay decision states. */
250 if( ( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 ) ) {
251 /* The NEON intrinsics optimization can currently parallelize at most 4 delay decision states. */
252 /* If there are more states, the C function is called, and this optimization must be expanded. */
253 /* When the number of delay decision states is less than 3, there are penalties using this */
254 /* optimization, and the C function is called. */
255 /* When the number of delay decision states is 2, it would be better to specialize another */
256 /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority) */
257 silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
258 Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
259 } else {
260 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner _ind, subfr;
261 opus_int smpl_buf_idx, decisionDelay;
262 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
263 opus_int16 *pxq;
264 VARDECL( opus_int32, sLTP_Q15 );
265 VARDECL( opus_int16, sLTP );
266 opus_int32 HarmShapeFIRPacked_Q14;
267 opus_int offset_Q10;
268 opus_int32 RDmin_Q10, Gain_Q10;
269 VARDECL( opus_int32, x_sc_Q10 );
270 VARDECL( opus_int32, delayedGain_Q10 );
271 VARDECL( NSQ_del_decs_struct, psDelDec );
272 int32x4_t t_s32x4;
273 SAVE_STACK;
274
275 /* Set unvoiced lag to the previous one, overwrite later for voiced */
276 lag = NSQ->lagPrev;
277
278 silk_assert( NSQ->prev_gain_Q16 != 0 );
279
280 /* Initialize delayed decision states */
281 ALLOC( psDelDec, 1, NSQ_del_decs_struct );
282 /* Only RandState and RD_Q10 need to be initialized to 0. */
283 silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
284 vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
285
286 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
287 psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
288 }
289 vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
290 vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
291 vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
292 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
293 vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
294 }
295 for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
296 vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
297 }
298
299 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
300 smpl_buf_idx = 0; /* index of oldest samples */
301
302 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
303
304 /* For voiced frames limit the decision delay to lower than the pitch lag */
305 if( psIndices->signalType == TYPE_VOICED ) {
306 opus_int pitch_min = pitchL[ 0 ];
307 for( k = 1; k < psEncC->nb_subfr; k++ ) {
308 pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
309 }
310 decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
311 } else {
312 if( lag > 0 ) {
313 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
314 }
315 }
316
317 if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
318 LSF_interpolation_flag = 0;
319 } else {
320 LSF_interpolation_flag = 1;
321 }
322
323 ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
324 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
325 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
326 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
327 /* Set up pointers to start of sub frame */
328 pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
329 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
330 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
331 subfr = 0;
332 for( k = 0; k < psEncC->nb_subfr; k++ ) {
333 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
334 B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
335 AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
336
337 /* Noise shape parameters */
338 silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
339 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
340 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
341
342 NSQ->rewhite_flag = 0;
343 if( psIndices->signalType == TYPE_VOICED ) {
344 /* Voiced */
345 lag = pitchL[ k ];
346
347 /* Re-whitening */
348 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
349 if( k == 2 ) {
350 /* RESET DELAYED DECISIONS */
351 /* Find winner */
352 int32x4_t RD_Q10_s32x4;
353 RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
354 Winner_ind = 0;
355 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
356 if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
357 RDmin_Q10 = psDelDec->RD_Q10[ i ];
358 Winner_ind = i;
359 }
360 }
361 psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
362 RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
363 RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
364 vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );
365
366 /* Copy final part of signals from winner state to output and long-term filter states */
367 copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );
368
369 subfr = 0;
370 }
371
372 /* Rewhiten with new A coefs */
373 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
374 silk_assert( start_idx > 0 );
375
376 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
377 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
378
379 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
380 NSQ->rewhite_flag = 1;
381 }
382 }
383
384 silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
385 LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
386
387 silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
388 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
389 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
390 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
391
392 x16 += psEncC->subfr_length;
393 pulses += psEncC->subfr_length;
394 pxq += psEncC->subfr_length;
395 }
396
397 /* Find winner */
398 RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
399 Winner_ind = 0;
400 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
401 if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
402 RDmin_Q10 = psDelDec->RD_Q10[ k ];
403 Winner_ind = k;
404 }
405 }
406
407 /* Copy final part of signals from winner state to output and long-term filter states */
408 psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
409 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
410 copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );
411
412 t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
413 for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
414 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
415 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
416 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
417 t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
418 vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
419 }
420
421 for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
422 NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
423 }
424
425 for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
426 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
427 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
428 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
429 t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
430 vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
431 }
432
433 for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
434 NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
435 }
436
437 /* Update states */
438 NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
439 NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ];
440 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
441
442 /* Save quantized speech signal */
443 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
444 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
445 RESTORE_STACK;
446 }
447
448 #ifdef OPUS_CHECK_ASM
449 silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
450 silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
451 silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
452 #endif
453 }
454
455 /******************************************/
456 /* Noise shape quantizer for one subframe */
457 /******************************************/
458 /* Note: Function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a plain C function. */
459 /* Therefore here we append "_local" to the NEON function name to avoid confusion. */
460 static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
461 {
462 int16x8_t t_s16x8;
463 int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
464 silk_assert( order == 10 || order == 16 );
465
466 t_s16x8 = vld1q_s16( in + 0 ); /* 7 6 5 4 3 2 1 0 */
467 t_s16x8 = vrev64q_s16( t_s16x8 ); /* 4 5 6 7 0 1 2 3 */
468 t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* 4 5 6 7 */
469 t3_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 0 1 2 3 */
470
471 if( order == 16 ) {
472 t_s16x8 = vld1q_s16( in + 8 ); /* F E D C B A 9 8 */
473 t_s16x8 = vrev64q_s16( t_s16x8 ); /* C D E F 8 9 A B */
474 t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* C D E F */
475 t1_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ); /* 8 9 A B */
476 } else {
477 int16x4_t t_s16x4;
478
479 t0_s32x4 = vdupq_n_s32( 0 ); /* zero zero zero zero */
480 t_s16x4 = vld1_s16( in + 6 ); /* 9 8 7 6 */
481 t_s16x4 = vrev64_s16( t_s16x4 ); /* 6 7 8 9 */
482 t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
483 t1_s32x4 = vcombine_s32( vget_low_s32( t0_s32x4 ), vget_low_s32( t1_s32x4 ) ); /* 8 9 zero zero */
484 }
485 vst1q_s32( out + 0, t0_s32x4 );
486 vst1q_s32( out + 4, t1_s32x4 );
487 vst1q_s32( out + 8, t2_s32x4 );
488 vst1q_s32( out + 12, t3_s32x4 );
489 }
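
/* Note (editorial, not part of the patch): the taps are reversed within each 64-bit half
   and pre-shifted left by 15 so that
   silk_noise_shape_quantizer_short_prediction_neon_local() below can use
   vqdmulhq_lane_s32(): with c' = c << 15, the doubling high half gives
   ( 2 * x * c' ) >> 32 == ( x * c ) >> 16, matching the scalar silk_SMLAWB() product
   term. For order 10, the six unused leading taps are zero-padded. */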
490
491 static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
492 const int32x4_t out_s32x4,
493 const int32x4_t in_s32x4,
494 const int32x2_t coef_s32x2
495 )
496 {
497 return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) );
498 }
499
500 static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
501 const int32x4_t out_s32x4,
502 const int32x4_t in_s32x4,
503 const int32x2_t coef_s32x2
504 )
505 {
506 return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
507 }
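
/* Editorial sketch (not part of the patch): the two lane helpers above are the vector
   form of silk_SMLAWB( out, in, coef ) = out + ( ( in * (opus_int64)(opus_int16)coef ) >> 16 ),
   given coefficients pre-shifted left by 15. One lane in scalar terms (hypothetical
   helper name, for illustration only):

       static opus_int32 smlawb_lane( opus_int32 out, opus_int32 in, opus_int32 coef_shl15 )
       {
           /* vqdmulhq doubles the product and keeps bits [63:32] */
           return out + (opus_int32)( ( 2 * (opus_int64)in * coef_shl15 ) >> 32 );
       }
*/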
508
509 /* Note: This function has a different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
510 /* Therefore here we append "_local" to the function name to avoid confusion. */
511 static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
512 {
513 const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
514 const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
515 const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
516 const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
517 int32x4_t LPC_pred_Q14_s32x4;
518
519 silk_assert( order == 10 || order == 16 );
520 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
521 LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
522 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
523 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
524 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
525 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
526 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
527 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
528 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
529 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
530 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
531 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
532 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
533 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
534 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
535 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
536 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
537 LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
538
539 return LPC_pred_Q14_s32x4;
540 }
541
542 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
543 silk_nsq_state *NSQ, /* I/O NSQ state */
544 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
545 opus_int signalType, /* I Signal type */
546 const opus_int32 x_Q10[], /* I */
547 opus_int8 pulses[], /* O */
548 opus_int16 xq[], /* O */
549 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
550 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
551 const opus_int16 a_Q12[], /* I Short term prediction coefs */
552 const opus_int16 b_Q14[], /* I Long term prediction coefs */
553 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
554 opus_int lag, /* I Pitch lag */
555 opus_int32 HarmShapeFIRPacked_Q14, /* I */
556 opus_int Tilt_Q14, /* I Spectral tilt */
557 opus_int32 LF_shp_Q14, /* I */
558 opus_int32 Gain_Q16, /* I */
559 opus_int Lambda_Q10, /* I */
560 opus_int offset_Q10, /* I */
561 opus_int length, /* I Input length */
562 opus_int subfr, /* I Subframe number */
563 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
564 opus_int predictLPCOrder, /* I Prediction filter order */
565 opus_int warping_Q16, /* I */
566 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
567 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
568 opus_int decisionDelay /* I */
569 )
570 {
571 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
572 opus_int32 Winner_rand_state;
573 opus_int32 LTP_pred_Q14, n_LTP_Q14;
574 opus_int32 RDmin_Q10, RDmax_Q10;
575 opus_int32 Gain_Q10;
576 opus_int32 *pred_lag_ptr, *shp_lag_ptr;
577 opus_int32 a_Q12_arch[MAX_LPC_ORDER];
578 const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
579 const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
580 opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
581 const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
582 const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );
583
584 VARDECL( NSQ_samples_struct, psSampleState );
585 SAVE_STACK;
586
587 silk_assert( nStatesDelayedDecision > 0 );
588 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
589 ALLOC( psSampleState, 2, NSQ_samples_struct );
590
591 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
592 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
593 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
594
595 for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
596 const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i );
597 vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) );
598 vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
599 }
600
601 for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
602 AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
603 }
604
605 silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );
606
607 for( i = 0; i < length; i++ ) {
608 int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
609 int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
610 int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
611 int32x2_t AR_shp_Q28_s32x2;
612 int16x4_t r_Q10_s16x4, rr_Q10_s16x4;
613
614 /* Perform common calculations used in all states */
615
616 /* Long-term prediction */
617 if( signalType == TYPE_VOICED ) {
618 /* Unrolled loop */
619 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
620 LTP_pred_Q14 = 2;
621 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] );
622 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
623 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
624 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
625 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
626 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
627 pred_lag_ptr++;
628 } else {
629 LTP_pred_Q14 = 0;
630 }
631
632 /* Long-term shaping */
633 if( lag > 0 ) {
634 /* Symmetric, packed FIR coefficients */
635 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
636 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
637 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
638 shp_lag_ptr++;
639 } else {
640 n_LTP_Q14 = 0;
641 }
642
643 /* Generate dither */
644 Seed_s32x4 = vld1q_s32( psDelDec->Seed );
645 Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
646 vst1q_s32( psDelDec->Seed, Seed_s32x4 );
647
648 /* Short-term prediction */
649 LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder );
650 LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */
651
652 /* Noise shape feedback */
653 /* Output of lowpass section */
654 tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
655 /* Output of allpass section */
656 tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
657 tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
658 vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
659 AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
660 n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
661
662 /* Loop over allpass sections */
663 for( j = 2; j < shapingLPCOrder; j += 2 ) {
664 /* Output of allpass section */
665 tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
666 tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
667 vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
668 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
669 /* Output of allpass section */
670 tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
671 tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
672 vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
673 AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
674 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
675 }
676 vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
677 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
678 n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */
679 n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */
680 n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */
681 n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */
682 n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16, 15 ) ) ); /* Q12 */
683 n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 */
684
685 /* Input minus prediction plus noise feedback */
686 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
687 tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 ); /* Q14 */
688 tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
689 tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 ); /* Q13 */
690 tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 ); /* Q10 */
691 tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 ); /* residual error Q10 */
692
693 /* Flip sign depending on dither */
694 sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
695 tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
696 tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
697 tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
698 tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
699 r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );
700
701 /* Find two quantization level candidates and measure their rate-distortion */
702 {
703 int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
704 int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
705 int16x4_t q2_Q10_s16x4;
706 int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
707 uint32x4_t t_u32x4;
708
709 if( Lambda_Q10 > 2048 ) {
710 /* For aggressive RDO, the bias becomes more than one pulse. */
711 const int rdo_offset = Lambda_Q10/2 - 512;
712 const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
713 const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
714 /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
715 silk_assert( Lambda_Q10 <= 32767 );
716
717 q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
718 q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
719 q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
720 q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
721 }
722 {
723 const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
724 const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
725 const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
726 int16x4_t tmp1_s16x4, tmp2_s16x4;
727
728 q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
729 tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
730 q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
731 q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
732 q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
733 q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
734 q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
735 q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
736 q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
737 tmp1_s16x4 = q1_Q10_s16x4;
738 tmp2_s16x4 = q2_Q10_s16x4;
739 tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
740 tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
741 rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
742 rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
743 }
744
745 rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
746 rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
747 rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );
748
749 rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
750 rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
751 rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );
752
753 tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
754 tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
755 tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
756 vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
757 vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
758 t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
759 tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
760 tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
761 vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
762 vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
763 }
764
765 {
766 /* Update states for best quantization */
767 int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;
768
769 /* Quantized excitation */
770 exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
771 exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
772 exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );
773
774 /* Add predictions */
775 LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
776 xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );
777
778 /* Update states */
779 tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
780 vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
781 sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
782 vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
783 vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
784 vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
785 vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );
786
787 /* Quantized excitation */
788 exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
789 exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
790 exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );
791
792 /* Add predictions */
793 LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
794 xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );
795
796 /* Update states */
797 tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
798 vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
799 sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
800 vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
801 vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
802 vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
803 vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
804 }
805
806 *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1 );
807 last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
808 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
809 if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
810
811 /* Find winner */
812 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
813 Winner_ind = 0;
814 for( k = 1; k < nStatesDelayedDecision; k++ ) {
815 if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
816 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
817 Winner_ind = k;
818 }
819 }
820
821 /* Increase RD values of expired states */
822 {
823 uint32x4_t t_u32x4;
824 Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
825 t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
826 t_u32x4 = vmvnq_u32( t_u32x4 );
827 t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
828 tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
829 tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
830 tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
831 tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
832 vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
833 vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
834
835 /* Find worst in first set and best in second set */
836 RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
837 RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
838 RDmax_ind = 0;
839 RDmin_ind = 0;
840 for( k = 1; k < nStatesDelayedDecision; k++ ) {
841 /* find worst in first set */
842 if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
843 RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
844 RDmax_ind = k;
845 }
846 /* find best in second set */
847 if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
848 RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
849 RDmin_ind = k;
850 }
851 }
852 }
853
854 /* Replace a state if best from second set outperforms worst in first set */
855 if( RDmin_Q10 < RDmax_Q10 ) {
856 opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
857 const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
858 / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
859 /* Only ( predictLPCOrder - 1 ) entries of the sLPC_Q14 buffer need to be updated, though the first several */
860 /* unused sLPC_Q14[] entries will differ from the C version when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
861 /* Here just update a constant ( NSQ_LPC_BUF_LENGTH - 1 ) entries for simplicity. */
862 for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
863 psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
864 }
865 for( j = 0; j < numOthers; j++ ) {
866 ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
867 }
868
869 psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
870 psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
871 psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
872 psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
873 psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
874 psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
875 psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
876 }
877
878 /* Write samples from winner to output and long-term filter states */
879 if( subfr > 0 || i >= decisionDelay ) {
880 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
881 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
882 silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
883 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
884 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ];
885 }
886 NSQ->sLTP_shp_buf_idx++;
887 NSQ->sLTP_buf_idx++;
888
889 /* Update states */
890 vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
891 vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
892 vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
893 vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
894 tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
895 vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
896 vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
897 vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
898 tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
899 tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
900 vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
901 vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
902 vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
903 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
904 }
905 /* Update LPC states */
906 silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
907
908 RESTORE_STACK;
909 }
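
/* Note (editorial, not part of the patch): the dither generation in the loop above is
   the scalar LCG silk_RAND(), i.e. seed = RAND_INCREMENT + seed * RAND_MULTIPLIER,
   applied to all four states at once. The multiply-accumulate is done in the unsigned
   domain (vmlaq_u32 plus vreinterpretq casts) so the modulo-2^32 wraparound the LCG
   depends on stays well-defined. */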
910
911 static OPUS_INLINE void silk_SMULWB_8_neon(
912 const opus_int16 *a,
913 const int32x2_t b,
914 opus_int32 *o
915 )
916 {
917 const int16x8_t a_s16x8 = vld1q_s16( a );
918 int32x4_t o0_s32x4, o1_s32x4;
919
920 o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 );
921 o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 );
922 o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 );
923 o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 );
924 vst1q_s32( o, o0_s32x4 );
925 vst1q_s32( o + 4, o1_s32x4 );
926 }
927
928 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
929 static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
930 opus_int32 *a,
931 const int32x2_t b_s32x2)
932 {
933 int32x4_t o_s32x4;
934
935 o_s32x4 = vld1q_s32( a );
936 o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 );
937 vst1q_s32( a, o_s32x4 );
938 }
939
940 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
941 static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
942 opus_int32 *a,
943 const int32x2_t b_s32x2
944 )
945 {
946 int32x4_t o0_s32x4, o1_s32x4;
947
948 o0_s32x4 = vld1q_s32( a );
949 o1_s32x4 = vld1q_s32( a + 4 );
950 o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
951 o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
952 vst1q_s32( a, o0_s32x4 );
953 vst1q_s32( a + 4, o1_s32x4 );
954 }
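
/* Note (editorial, not part of the patch): the "small b" restriction exists because
   these helpers place the whole multiplier in lane 0 as ( b << 15 ). For
   -65536 <= b < 65536 that value still fits in 32 bits, so a single
   vqdmulhq_lane_s32() already yields ( a * (opus_int64)b ) >> 16 and the extra
   vmlaq_lane_s32() step of the full-range variants below can be skipped. */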
955
956 static OPUS_INLINE void silk_SMULWW_4_neon(
957 opus_int32 *a,
958 const int32x2_t b_s32x2)
959 {
960 int32x4_t a_s32x4, o_s32x4;
961
962 a_s32x4 = vld1q_s32( a );
963 o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 );
964 o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 );
965 vst1q_s32( a, o_s32x4 );
966 }
967
968 static OPUS_INLINE void silk_SMULWW_8_neon(
969 opus_int32 *a,
970 const int32x2_t b_s32x2
971 )
972 {
973 int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4;
974
975 a0_s32x4 = vld1q_s32( a );
976 a1_s32x4 = vld1q_s32( a + 4 );
977 o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 );
978 o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 );
979 o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 );
980 o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 );
981 vst1q_s32( a, o0_s32x4 );
982 vst1q_s32( a + 4, o1_s32x4 );
983 }
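
/* Sketch (editorial, assuming the standard SILK fixed-point definitions): the
   full-range variants above decompose silk_SMULWW( a, b ) = ( a * (opus_int64)b ) >> 16
   via b = ( b_hi << 16 ) + b_lo, storing ( b_lo << 15 ) in lane 0 and b_hi in lane 1 of
   b_s32x2, since

       ( a * b ) >> 16 == ( ( a * b_lo ) >> 16 ) + a * b_hi

   where vqdmulhq_lane_s32( a, b_s32x2, 0 ) supplies the first term (its doubling high
   half turns the 15-bit pre-shift into a 16-bit right shift) and
   vmlaq_lane_s32( ..., a, b_s32x2, 1 ) accumulates the second, keeping everything in
   32-bit lanes without a widening multiply. */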
984
985 static OPUS_INLINE void silk_SMULWW_loop_neon(
986 const opus_int16 *a,
987 const opus_int32 b,
988 opus_int32 *o,
989 const opus_int loop_num
990 )
991 {
992 opus_int i;
993 int32x2_t b_s32x2;
994
995 b_s32x2 = vdup_n_s32( b );
996 for( i = 0; i < loop_num - 7; i += 8 ) {
997 silk_SMULWB_8_neon( a + i, b_s32x2, o + i );
998 }
999 for( ; i < loop_num; i++ ) {
1000 o[ i ] = silk_SMULWW( a[ i ], b );
1001 }
1002 }
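
/* Note (editorial, not part of the patch): silk_SMULWW_loop_neon() widens the 16-bit
   input instead of the multiplier: silk_SMULWB_8_neon() forms ( a << 15 ) per element,
   and ( 2 * ( a << 15 ) * b ) >> 32 == ( a * b ) >> 16. Because a[] is only 16 bits
   wide, silk_SMULWB() and silk_SMULWW() coincide here, which is why the scalar tail
   loop can use silk_SMULWW() for the leftover elements. */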
1003
1004 static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
1005 const silk_encoder_state *psEncC, /* I Encoder State */
1006 silk_nsq_state *NSQ, /* I/O NSQ state */
1007 NSQ_del_decs_struct psDelDec[], /* I/O Delayed decision states */
1008 const opus_int16 x16[], /* I Input */
1009 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
1010 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
1011 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
1012 opus_int subfr, /* I Subframe number */
1013 const opus_int LTP_scale_Q14, /* I LTP state scaling */
1014 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
1015 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
1016 const opus_int signal_type, /* I Signal type */
1017 const opus_int decisionDelay /* I Decision delay */
1018 )
1019 {
1020 opus_int i, lag;
1021 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
1022
1023 lag = pitchL[ subfr ];
1024 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
1025 silk_assert( inv_gain_Q31 != 0 );
1026
1027 /* Scale input */
1028 inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
1029 silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );
1030
1031 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
1032 if( NSQ->rewhite_flag ) {
1033 if( subfr == 0 ) {
1034 /* Do LTP downscaling */
1035 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
1036 }
1037 silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
1038 }
1039
1040 /* Adjust for changing gain */
1041 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
1042 int32x2_t gain_adj_Q16_s32x2;
1043 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
1044
1045 /* Scale long-term shaping state */
1046 if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
1047 gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
1048 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
1049 silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
1050 }
1051 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
1052 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
1053 }
1054
1055 /* Scale long-term prediction state */
1056 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
1057 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
1058 silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
1059 }
1060 for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
1061 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
1062 }
1063 }
1064
1065 /* Scale scalar states */
1066 silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
1067 silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );
1068
1069 /* Scale short-term prediction and shaping states */
1070 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
1071 silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
1072 }
1073
1074 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
1075 silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
1076 }
1077
1078 for( i = 0; i < DECISION_DELAY; i++ ) {
1079 silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
1080 silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
1081 }
1082 } else {
1083 gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
1084 gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
1085 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
1086 silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
1087 }
1088 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
1089 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
1090 }
1091
1092 /* Scale long-term prediction state */
1093 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
1094 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
1095 silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
1096 }
1097 for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
1098 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
1099 }
1100 }
1101
1102 /* Scale scalar states */
1103 silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
1104 silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );
1105
1106 /* Scale short-term prediction and shaping states */
1107 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
1108 silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
1109 }
1110
1111 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
1112 silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
1113 }
1114
1115 for( i = 0; i < DECISION_DELAY; i++ ) {
1116 silk_SMULWW_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
1117 silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
1118 }
1119 }
1120
1121 /* Save inverse gain */
1122 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
1123 }
1124 }