| Index: celt/x86/pitch_sse.h
|
| diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
|
| index 695122a5adda014e2ac77a01b6ade6184872ed3a..99d1919a2e62579f40f1d641099bbd7691c9f2f2 100644
|
| --- a/celt/x86/pitch_sse.h
|
| +++ b/celt/x86/pitch_sse.h
|
| @@ -1,4 +1,5 @@
|
| -/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
|
| +/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
|
| + Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
|
| /**
|
| @file pitch_sse.h
|
| @brief Pitch analysis
|
| @@ -32,11 +33,56 @@
|
| #ifndef PITCH_SSE_H
|
| #define PITCH_SSE_H
|
|
|
| +#if defined(HAVE_CONFIG_H)
|
| +#include "config.h"
|
| +#endif
|
| +
|
| +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
|
| +#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
|
| +void xcorr_kernel_sse4_1( /* Prototype only; the definition is not in this header. NOTE(review): presumably the SSE4.1 entry of XCORR_KERNEL_IMPL -- confirm. */
|
| + const opus_int16 *x, /* read-only 16-bit input */
|
| + const opus_int16 *y, /* read-only 16-bit input */
|
| + opus_val32 sum[4], /* four-element array, not const, so presumably written by the callee */
|
| + int len);
|
| +
|
| +extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( /* Constant table of OPUS_ARCHMASK + 1 function pointers, defined elsewhere; only declared when OPUS_X86_MAY_HAVE_SSE4_1 is defined. */
|
| + const opus_int16 *x,
|
| + const opus_int16 *y,
|
| + opus_val32 sum[4],
|
| + int len);
|
| +
|
| +#define OVERRIDE_XCORR_KERNEL /* NOTE(review): presumably suppresses a generic xcorr_kernel defined elsewhere -- confirm. */
|
| +#define xcorr_kernel(x, y, sum, len, arch) \
|
| + ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) /* Calls the XCORR_KERNEL_IMPL entry at index (arch) & OPUS_ARCHMASK; the mask keeps the index inside the table. */
|
| +
|
| +opus_val32 celt_inner_prod_sse4_1( /* Prototype only; the definition is not in this header. NOTE(review): presumably the SSE4.1 entry of CELT_INNER_PROD_IMPL -- confirm. */
|
| + const opus_int16 *x,
|
| + const opus_int16 *y,
|
| + int N);
|
| +#endif
|
| +
|
| +#if defined(OPUS_X86_MAY_HAVE_SSE2)
|
| +opus_val32 celt_inner_prod_sse2( /* Prototype only; the definition is not in this header. Declared only when OPUS_X86_MAY_HAVE_SSE2 is defined. */
|
| + const opus_int16 *x,
|
| + const opus_int16 *y,
|
| + int N);
|
| +#endif
|
| +
|
| +extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( /* Constant table of OPUS_ARCHMASK + 1 function pointers, defined elsewhere; declared when either SSE4.1 or SSE2 may be present. NOTE(review): no include in this hunk supplies opus_int16, opus_val32 or OPUS_ARCHMASK -- presumably the including file does; confirm. */
|
| + const opus_int16 *x,
|
| + const opus_int16 *y,
|
| + int N);
|
| +
|
| +#define OVERRIDE_CELT_INNER_PROD /* NOTE(review): presumably suppresses a generic celt_inner_prod defined elsewhere -- confirm. */
|
| +#define celt_inner_prod(x, y, N, arch) \
|
| + ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) /* Calls the CELT_INNER_PROD_IMPL entry at index (arch) & OPUS_ARCHMASK and yields its return value. */
|
| +#else
|
| +
|
| #include <xmmintrin.h>
|
| #include "arch.h"
|
|
|
| #define OVERRIDE_XCORR_KERNEL
|
| -static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
|
| +static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
|
| {
|
| int j;
|
| __m128 xsum1, xsum2;
|
| @@ -71,6 +117,9 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, o
|
| _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
|
| }
|
|
|
| +#define xcorr_kernel(_x, _y, _z, len, arch) \
|
| + ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) /* No dispatch in this branch: arch is evaluated and discarded, then xcorr_kernel_sse is called with the remaining arguments. */
|
| +
|
| #define OVERRIDE_DUAL_INNER_PROD
|
| static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
|
| int N, opus_val32 *xy1, opus_val32 *xy2)
|
| @@ -101,6 +150,35 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
|
| }
|
| }
|
|
|
| +#define OVERRIDE_CELT_INNER_PROD
|
| +static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, /* Returns the sum of x[i]*y[i] for i in [0, N), computed four floats at a time with SSE plus a scalar tail. */
|
| + int N) /* N: number of elements read from each of x and y. */
|
| +{
|
| + int i;
|
| + float xy; /* Scalar result: receives the reduced vector sum, then the tail products. */
|
| + __m128 sum; /* Four independent partial sums, one per lane. */
|
| + sum = _mm_setzero_ps();
|
| + /* Main loop: full groups of four elements, loaded unaligned. FIXME: We should probably go 8-way and use 2 sums. */
|
| + for (i=0;i<N-3;i+=4)
|
| + {
|
| + __m128 xi = _mm_loadu_ps(x+i); /* NOTE(review): loads x as float, so this presumably only builds when opus_val16 is float -- confirm. */
|
| + __m128 yi = _mm_loadu_ps(y+i);
|
| + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
|
| + }
|
| + /* Horizontal sum: fold lanes 2,3 onto lanes 0,1, then add lane 1 into lane 0 (0x55 selects lane 1). */
|
| + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
|
| + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
|
| + _mm_store_ss(&xy, sum); /* Lane 0 now holds the total of all four partial sums. */
|
| + for (;i<N;i++) /* Up to three leftover elements that did not fill a group of four. */
|
| + {
|
| + xy = MAC16_16(xy, x[i], y[i]); /* MAC16_16 is defined elsewhere; presumably xy + x[i]*y[i] -- confirm. */
|
| + }
|
| + return xy;
|
| +}
|
| +
|
| +# define celt_inner_prod(_x, _y, len, arch) \
|
| + ((void)(arch),celt_inner_prod_sse(_x, _y, len)) /* No dispatch in this branch: arch is evaluated and discarded, and the result of celt_inner_prod_sse is the macro's value. */
|
| +
|
| #define OVERRIDE_COMB_FILTER_CONST
|
| static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
|
| opus_val16 g10, opus_val16 g11, opus_val16 g12)
|
| @@ -154,3 +232,4 @@ static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, i
|
| }
|
|
|
| #endif
|
| +#endif
|
|
|