/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "SigProc_FIX.h"
#include "define.h"

#define QA      24
#define A_LIMIT SILK_FIX_CONST( 0.99975, QA )

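/* Rounded fractional multiply: ( a32 * b32 ) >> Q with rounding, the result truncated to 32 bits. */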
#define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))

/* The difficulty is how to judge whether a 64-bit signed integer tmp64 overflows 32 bits,
 * since NEON has no 64-bit min, max or comparison instructions.
 * A tempting but flawed idea is to compare the results of vmovn(tmp64) and vqmovn(tmp64) for equality.
 * However, that comparison fails when tmp64 is something like 0xFFFFFFF980000000.
 * Here we know that mult2Q >= 1, so the highest bit of tmp64 (bit 63, the sign bit) must equal bit 62.
 * Shift tmp64 left by 1 to get tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1,
 * then bits 31 to 63 of tmp64 cannot all equal the sign bit, and therefore tmp64 overflows 32 bits.
 * That is, we test whether tmp64' > 0x00000000FFFFFFFF, or tmp64' < (signed)0xFFFFFFFF00000000.
 * We narrow with a right shift by 31 bits into tmp32' to save data bandwidth and instructions.
 * That is, we test whether tmp32' > 0, or tmp32' < -1.
 */
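/* For instance, tmp64 = 0xFFFFFFF980000000 (about -2.8e10, far below silk_int32_MIN) gives
 * vmovn(tmp64) = 0x80000000 and vqmovn(tmp64) = 0x80000000 (saturated), so the flawed equality
 * test misses the overflow. With the scheme used here, tmp32' = (opus_int32)( tmp64 >> 31 )
 * = 0xFFFFFFF3 = -13, which is < -1, so the overflow is caught.
 */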

/* Compute inverse of LPC prediction gain, and                          */
/* test if LPC coefficients are stable (all poles within unit circle)   */
static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O    Returns inverse prediction gain in energy domain, Q30  */
    opus_int32           A_QA[ SILK_MAX_ORDER_LPC ],         /* I    Prediction coefficients                                */
    const opus_int       order                               /* I    Prediction order                                       */
)
{
    opus_int   k, n, mult2Q;
    opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2;
    opus_int32 max, min;
    int32x4_t  max_s32x4, min_s32x4;
    int32x2_t  max_s32x2, min_s32x2;

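    /* max_s32x4/min_s32x4 accumulate the running min/max of the narrowed high words of */
    /* every 64-bit intermediate; they are range-checked once after the loop to detect  */
    /* 32-bit overflow (see the note above the function).                               */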
    max_s32x4   = vdupq_n_s32( silk_int32_MIN );
    min_s32x4   = vdupq_n_s32( silk_int32_MAX );
    invGain_Q30 = SILK_FIX_CONST( 1, 30 );
    for( k = order - 1; k > 0; k-- ) {
        int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
        int64x2_t mult2Q_s64x2;

        /* Check for stability */
        if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
            return 0;
        }

        /* Set RC equal to negated AR coef */
        rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA );

        /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
        rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
        silk_assert( rc_mult1_Q30 > ( 1 << 15 ) );                   /* reduce A_LIMIT if fails */
        silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );

        /* Update inverse gain */
        /* invGain_Q30 range: [ 0 : 2^30 ] */
        invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
        silk_assert( invGain_Q30 >= 0 );
        silk_assert( invGain_Q30 <= ( 1 << 30 ) );
        if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
            return 0;
        }

        /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
        mult2Q   = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
        rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );

        /* Update AR coefficient */
        rc_Q31_s32x2   = vdup_n_s32( rc_Q31 );
        mult2Q_s64x2   = vdupq_n_s64( -mult2Q );
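        /* A negative shift count makes vrshlq_s64 below perform a rounding right shift by mult2Q. */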
        rc_mult2_s32x2 = vdup_n_s32( rc_mult2 );

        for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) {
            /* We always calculate extra elements of the A_QA buffer when ( k % 4 ) != 0, to take advantage of SIMD parallelization. */
            int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4;
            int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
            tmp1_s32x4 = vld1q_s32( A_QA + n );
            tmp2_s32x4 = vld1q_s32( A_QA + k - n - 4 );
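            /* Reverse the 4 lanes of tmp2 so that lane i holds A_QA[ k - n - 1 - i ]:       */
            /* vrev64q swaps the lanes within each 64-bit half, vcombine swaps the halves.   */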
            tmp2_s32x4 = vrev64q_s32( tmp2_s32x4 );
            tmp2_s32x4 = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) );
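            /* vqrdmulhq_lane_s32 (doubling, rounding, high-half multiply) is the vector     */
            /* counterpart of MUL32_FRAC_Q( x, rc_Q31, 31 ) in the scalar tail loop below,   */
            /* and vqsubq_s32 matches its silk_SUB_SAT32.                                    */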
            t0_s32x4    = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 );
            t1_s32x4    = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 );
            t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 );
            t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 );
            t0_s64x2    = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 );
            t1_s64x2    = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 );
            t2_s64x2    = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 );
            t3_s64x2    = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 );
            t0_s64x2    = vrshlq_s64( t0_s64x2, mult2Q_s64x2 );
            t1_s64x2    = vrshlq_s64( t1_s64x2, mult2Q_s64x2 );
            t2_s64x2    = vrshlq_s64( t2_s64x2, mult2Q_s64x2 );
            t3_s64x2    = vrshlq_s64( t3_s64x2, mult2Q_s64x2 );
            t0_s32x4    = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) );
            t1_s32x4    = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) );
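            /* The narrowing shift by 31 keeps bits 31..62 of each 64-bit result, which are */
            /* all 0 or all 1 if and only if the value fits in 32 bits (given mult2Q >= 1). */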
            s0_s32x4    = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) );
            s1_s32x4    = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) );
            max_s32x4   = vmaxq_s32( max_s32x4, s0_s32x4 );
            min_s32x4   = vminq_s32( min_s32x4, s0_s32x4 );
            max_s32x4   = vmaxq_s32( max_s32x4, s1_s32x4 );
            min_s32x4   = vminq_s32( min_s32x4, s1_s32x4 );
            t1_s32x4    = vrev64q_s32( t1_s32x4 );
            t1_s32x4    = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) );
            vst1q_s32( A_QA + n,         t0_s32x4 );
            vst1q_s32( A_QA + k - n - 4, t1_s32x4 );
        }
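        /* Scalar tail for the remaining (middle) pairs, with an explicit 32-bit overflow check on the 64-bit result. */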
        for( ; n < (k + 1) >> 1; n++ ) {
            opus_int64 tmp64;
            tmp1 = A_QA[ n ];
            tmp2 = A_QA[ k - n - 1 ];
            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp1,
                  MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q);
            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
                return 0;
            }
            A_QA[ n ] = ( opus_int32 )tmp64;
            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp2,
                  MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2), mult2Q);
            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
                return 0;
            }
            A_QA[ k - n - 1 ] = ( opus_int32 )tmp64;
        }
    }

    /* Check for stability */
    if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
        return 0;
    }

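    /* Reduce the per-lane min/max to scalars: any narrowed high word other than 0 or -1 */
    /* means some 64-bit intermediate overflowed 32 bits, so the filter is treated as    */
    /* unstable, matching the scalar tail loop's early return.                           */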
    max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) );
    min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) );
    max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) );
    min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) );
    max = vget_lane_s32( max_s32x2, 0 );
    min = vget_lane_s32( min_s32x2, 0 );
    if( ( max > 0 ) || ( min < -1 ) ) {
        return 0;
    }

    /* Set RC equal to negated AR coef */
    rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA );

    /* Range: [ 1 : 2^30 ] */
    rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );

    /* Update inverse gain */
    /* Range: [ 0 : 2^30 ] */
    invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
    silk_assert( invGain_Q30 >= 0 );
    silk_assert( invGain_Q30 <= ( 1 << 30 ) );
    if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
        return 0;
    }

    return invGain_Q30;
}

/* For input in Q12 domain */
opus_int32 silk_LPC_inverse_pred_gain_neon( /* O    Returns inverse prediction gain in energy domain, Q30  */
    const opus_int16 *A_Q12,                /* I    Prediction coefficients, Q12 [order]                    */
    const opus_int   order                  /* I    Prediction order                                        */
)
{
#ifdef OPUS_CHECK_ASM
    const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order );
#endif

    opus_int32 invGain_Q30;
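    /* The NEON path assumes the stock SILK_MAX_ORDER_LPC of 24 and an even order; */
    /* anything else falls back to the portable C implementation.                  */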
    if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 )) {
        invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order );
    }
    else {
        opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ];
        opus_int32 DC_resp;
        int16x8_t  t0_s16x8, t1_s16x8, t2_s16x8;
        int32x4_t  t0_s32x4;
        const opus_int leftover = order & 7;

        /* Increase Q domain of the AR coefficients */
        t0_s16x8 = vld1q_s16( A_Q12 + 0 );
        t1_s16x8 = vld1q_s16( A_Q12 + 8 );
        t2_s16x8 = vld1q_s16( A_Q12 + 16 );
        t0_s32x4 = vpaddlq_s16( t0_s16x8 );
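        /* Pairwise-add the coefficients; the switch below folds in the vectors covered */
        /* by `order` to form DC_resp, the predictor's response at DC.                  */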

        switch( order - leftover )
        {
        case 24:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 );
            /* Intend to fall through */

        case 16:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 );
            vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
            /* Intend to fall through */

        case 8:
        {
            const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) );
            const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 );
            DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 );
            vst1q_s32( Atmp_QA + 8,  vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
        }
        break;

        default:
            DC_resp = 0;
            break;
        }
        A_Q12 += order - leftover;

        switch( leftover )
        {
        case 6:
            DC_resp += (opus_int32)A_Q12[ 5 ];
            DC_resp += (opus_int32)A_Q12[ 4 ];
            /* Intend to fall through */

        case 4:
            DC_resp += (opus_int32)A_Q12[ 3 ];
            DC_resp += (opus_int32)A_Q12[ 2 ];
            /* Intend to fall through */

        case 2:
            DC_resp += (opus_int32)A_Q12[ 1 ];
            DC_resp += (opus_int32)A_Q12[ 0 ];
            /* Intend to fall through */

        default:
            break;
        }

        /* If the DC is unstable (DC response >= 1.0 in Q12), we don't even need to do the full calculations */
        if( DC_resp >= 4096 ) {
            invGain_Q30 = 0;
        } else {
            vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
            invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
        }
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( invGain_Q30_c == invGain_Q30 );
#endif

    return invGain_Q30;
}