/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
# include "stack_alloc.h"
#endif
#include "SigProc_FIX.h"

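/* Filter kernel for one time step of the 2-channel (interleaved, stride-2) biquad.
   The caller passes t_s32x2 = silk_SMULWB( B_Q28[ 0 ], in{0,1} ).  Each 32-bit Q28 AR
   coefficient is applied in two 16-bit halves (A_U upper, A_L lower, both pre-shifted
   left by 15 bits), so that vqdmulhq_s32(), which computes ( 2 * a * b ) >> 32,
   reproduces the ( a * b ) >> 16 of silk_SMULWB() without a 64-bit multiply. */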
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} )                                      */
    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                    */
    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1}                                                             */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 )                            */
    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND();  S{2,3} = silk_RSHIFT_ROUND();           */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 )                            */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 );          */
    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} )                                  */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} );                        */
}

void silk_biquad_alt_stride2_neon(
    const opus_int16            *in,                /* I     input signal                                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
    opus_int16                  *out,               /* O     output signal                                              */
    const opus_int32            len                 /* I     signal length (must be even)                               */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4 = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

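    /* When OPUS_CHECK_ASM is defined, run the scalar reference implementation in
       parallel on a copy of the state; the asserts at the end of this function verify
       that the NEON results match it bit-exactly. */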
#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

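    /* Each Q28 AR coefficient is negated and split into a 14-bit lower half (A_L) and
       an upper half (A_U), both then placed 15 bits up in a 32-bit lane, so that the
       doubling multiply of vqdmulh, ( 2 * a * b ) >> 32, matches silk_SMULWB()'s
       ( a * b ) >> 16. */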
    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                            /* ( -A_Q28[] & 0x00003FFF ) << 18                                                      */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15                                                    */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                            /* silk_RSHIFT( -A_Q28[], 14 )                                                          */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                              /* silk_RSHIFT( -A_Q28[], 14 ) << 16  (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                               /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                    */

    B_Q28_s32x2 = vld1_s32( B_Q28 );
    t_s32x2     = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2  = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2  = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2  = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4   = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );      /* A{0,0,1,1}_L_Q28          */
    A_U_s32x4   = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );      /* A{0,0,1,1}_U_Q28          */
    B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );      /* B_Q28[ {1,1,2,2} ]        */
    S_s32x4     = vld1q_s32( S );                                                /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2   = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4     = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

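    /* Main loop: each iteration loads four interleaved int16 samples (two per channel),
       advances the filter two time steps via two kernel calls, and stores four outputs. */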
    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
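        /* The int16 inputs are widened and pre-shifted left by 15 so that vqdmulh
           against the raw Q28 coefficients equals silk_SMULWB( coefficient, input ). */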
        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                             /* in{0,1,2,3} << 15                      */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );      /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3}               */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );            /* out32_Q14_{0,1,2,3} + (1<<14) - 1      */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                   /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                    /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
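    /* With the documented even len the loop above consumes all samples and this branch
       is skipped; it also covers a final interleaved pair if len is odd. */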
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );               /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );               /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s32x4 = vshll_n_s16( in_s16x4, 15 );                                  /* in{0,1} << 15                      */
        t_s32x2  = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
        in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15            */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );             /* out32_Q14_{0,1} + (1<<14) - 1      */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );      /* out32_Q14_{0,1,0,1} + (1<<14) - 1  */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                   /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                        /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                        /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }
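
    /* Store the state back in memory order: the vtrn_s32() above swapped lanes 1 and 2,
       so they are swapped back here. */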
    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                       /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                       /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                       /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                       /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}