| Index: silk/fixed/x86/burg_modified_FIX_sse.c
|
| diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/x86/burg_modified_FIX_sse.c
|
| similarity index 66%
|
| copy from silk/fixed/burg_modified_FIX.c
|
| copy to silk/fixed/x86/burg_modified_FIX_sse.c
|
| index db348295bf86d3e2c4b2c2a20b6e1b6c81f31bd6..3756095fbe62dc073911c21d21174c7f44f434e4 100644
|
| --- a/silk/fixed/burg_modified_FIX.c
|
| +++ b/silk/fixed/x86/burg_modified_FIX_sse.c
|
| @@ -1,38 +1,43 @@
|
| -/***********************************************************************
|
| -Copyright (c) 2006-2011, Skype Limited. All rights reserved.
|
| -Redistribution and use in source and binary forms, with or without
|
| -modification, are permitted provided that the following conditions
|
| -are met:
|
| -- Redistributions of source code must retain the above copyright notice,
|
| -this list of conditions and the following disclaimer.
|
| -- Redistributions in binary form must reproduce the above copyright
|
| -notice, this list of conditions and the following disclaimer in the
|
| -documentation and/or other materials provided with the distribution.
|
| -- Neither the name of Internet Society, IETF or IETF Trust, nor the
|
| -names of specific contributors, may be used to endorse or promote
|
| -products derived from this software without specific prior written
|
| -permission.
|
| -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
| -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
| -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
| -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
| -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
| -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
| -POSSIBILITY OF SUCH DAMAGE.
|
| -***********************************************************************/
|
| +/* Copyright (c) 2014, Cisco Systems, INC
|
| + Written by XiangMingZhu WeiZhou MinPeng YanWang
|
| +
|
| + Redistribution and use in source and binary forms, with or without
|
| + modification, are permitted provided that the following conditions
|
| + are met:
|
| +
|
| + - Redistributions of source code must retain the above copyright
|
| + notice, this list of conditions and the following disclaimer.
|
| +
|
| + - Redistributions in binary form must reproduce the above copyright
|
| + notice, this list of conditions and the following disclaimer in the
|
| + documentation and/or other materials provided with the distribution.
|
| +
|
| + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
| + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
| + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
| + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
| + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| +*/
|
|
|
| #ifdef HAVE_CONFIG_H
|
| #include "config.h"
|
| #endif
|
|
|
| +#include <xmmintrin.h>
|
| +#include <emmintrin.h>
|
| +#include <smmintrin.h>
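|
| +/* xmmintrin.h and emmintrin.h provide SSE/SSE2; smmintrin.h adds the SSE4.1 intrinsics used below ( _mm_cvtepi16_epi32, _mm_mullo_epi32, _mm_mul_epi32, _mm_blend_epi16 ). */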
|
| +
|
| #include "SigProc_FIX.h"
|
| #include "define.h"
|
| #include "tuning_parameters.h"
|
| #include "pitch.h"
|
| +#include "celt/x86/x86cpu.h"
|
|
|
| #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
|
|
|
| @@ -42,7 +47,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
| #define MAX_RSHIFTS (32 - QA)
|
|
|
| /* Compute reflection coefficients from input signal */
|
| -void silk_burg_modified(
|
| +void silk_burg_modified_sse4_1(
|
| opus_int32 *res_nrg, /* O Residual energy */
|
| opus_int *res_nrg_Q, /* O Residual energy Q value */
|
| opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
|
| @@ -64,6 +69,9 @@ void silk_burg_modified(
|
| opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
|
| opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
|
|
|
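| + /* Note: the _3210 suffix mirrors SIMD lane order, element 3 down to element 0. */
|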
| + __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
|
| + __m128i CONST1 = _mm_set1_epi32(1);
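|
| + /* CONST1 supplies the +1 rounding bias used to emulate silk_RSHIFT_ROUND in the main loop below. */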
|
| +
|
| silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
|
|
|
| /* Compute autocorrelations, added over subframes */
|
| @@ -91,7 +99,7 @@ void silk_burg_modified(
|
| x_ptr = x + s * subfr_length;
|
| for( n = 1; n < D + 1; n++ ) {
|
| C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
|
| - silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n ), rshifts );
|
| + silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
|
| }
|
| }
|
| } else {
|
| @@ -150,20 +158,108 @@ void silk_burg_modified(
|
| x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */
|
| tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */
|
| tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */
|
| - for( k = 0; k < n; k++ ) {
|
| +
|
| + X1_3210 = _mm_set1_epi32( x1 );
|
| + X2_3210 = _mm_set1_epi32( x2 );
|
| + TMP1_3210 = _mm_setzero_si128();
|
| + TMP2_3210 = _mm_setzero_si128();
|
| + for( k = 0; k < n - 3; k += 4 ) {
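|
| + /* Main vectorized loop: four taps per iteration. OP_CVTEPI16_EPI32_M64 ( celt/x86/x86cpu.h ) sign-extends four 16-bit samples into 32-bit lanes. */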
|
| + PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
|
| + SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
|
| + FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
|
| + PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
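|
| + /* Reverse lane order: the scalar code reads x_ptr[ n - k - 1 ] with descending indices, while the load above used ascending addresses. */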
|
| + LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
|
| + ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );
|
| +
|
| + T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
|
| + T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
|
| +
|
| + ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
|
| + ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
|
| + ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
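|
| + /* Vector form of silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ): the 7/1 shift split assumes QA == 25, i.e. ( ( a >> 7 ) + 1 ) >> 1. */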
|
| +
|
| + FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
|
| + LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
|
| +
|
| + PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
|
| + SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
|
| +
|
| + _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
|
| + _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );
|
| +
|
| + TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
|
| + TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
|
| + }
|
| +
|
| + TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
|
| + TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
|
| + TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
|
| + TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
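|
| + /* Horizontal sum: fold the upper 64 bits onto the lower 64, then move 32-bit lane 1 into lane 0 ( the 0x0E shufflelo ), leaving each four-lane total in lane 0 for _mm_cvtsi128_si32. */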
|
| +
|
| + tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
|
| + tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
|
| +
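|
| + /* Scalar tail: the remaining n % 4 taps, identical to the original code. */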
|
| + for( ; k < n; k++ ) {
|
| C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
|
| C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
|
| Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
|
| tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
|
| tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
|
| }
|
| - tmp1 = -tmp1; /* Q17 */
|
| - tmp2 = -tmp2; /* Q17 */
|
| - for( k = 0; k <= n; k++ ) {
|
| - CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
|
| - silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */
|
| - CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
|
| - silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
|
| +
|
| + tmp1 = -tmp1; /* Q17 */
|
| + tmp2 = -tmp2; /* Q17 */
|
| +
|
| + {
|
| + __m128i xmm_tmp1, xmm_tmp2;
|
| + __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
|
| + __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
|
| +
|
| + xmm_tmp1 = _mm_set1_epi32( tmp1 );
|
| + xmm_tmp2 = _mm_set1_epi32( tmp2 );
|
| +
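|
| + /* Vectorized CAf/CAb update, four of the n + 1 coefficients per pass; silk_SMLAWW( a, b, c ) = a + ( ( (opus_int64)b * c ) >> 16 ) is emulated with _mm_mul_epi32 and 64-bit shifts. */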
|
| + for( k = 0; k <= n - 3; k += 4 ) {
|
| + xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
|
| + xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
|
| +
|
| + xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
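|
| + /* Reverse lanes to match the descending x_ptr[ n - k ] order of the scalar code. */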
|
| +
|
| + xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
|
| + xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
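|
| + /* Same count as the scalar silk_LSHIFT32( x, -rshifts - 1 ), which is only valid when -rshifts - 1 >= 0, so the shift count is non-negative on this path. */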
|
| +
|
| + /* Equivalent to a 4-byte right shift, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128( xmm_x_ptr_n_k_x2x0, 4 ), for the even lanes consumed by _mm_mul_epi32 below */
|
| + xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
| + xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
| +
|
| + xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
|
| + xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
|
| + xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
|
| + xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
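|
| + /* _mm_mul_epi32 multiplies the even 32-bit lanes ( 0 and 2 ) into 64-bit products; the x3x1 copies route the odd lanes through the same instruction. */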
|
| +
|
| + xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
|
| + xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
|
| + xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
|
| + xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
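|
| + /* Position each ( product >> 16 ) in 32-bit lanes 0/2 ( even inputs ) and 1/3 ( odd inputs ); the 0xCC blends below merge them into four SMLAWW terms. */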
|
| +
|
| + xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
|
| + xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
|
| +
|
| + X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
|
| + PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );
|
| +
|
| + X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
|
| + PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
|
| +
|
| + _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
|
| + _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
|
| + }
|
| +
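|
| + /* Scalar tail for the last ( n + 1 ) % 4 coefficients. */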
|
| + for( ; k <= n; k++ ) {
|
| + CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
|
| + silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */
|
| + CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
|
| + silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
|
| + }
|
| }
|
| }
|
| }
|
| @@ -252,12 +348,12 @@ void silk_burg_modified(
|
| if( rshifts > 0 ) {
|
| for( s = 0; s < nb_subfr; s++ ) {
|
| x_ptr = x + s * subfr_length;
|
| - C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D ), rshifts );
|
| + C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
|
| }
|
| } else {
|
| for( s = 0; s < nb_subfr; s++ ) {
|
| x_ptr = x + s * subfr_length;
|
| - C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D ), -rshifts );
|
| + C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
|
| }
|
| }
|
| /* Approximate residual energy */
|
|
|