Index: celt/x86/pitch_sse.h |
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h |
index 695122a5adda014e2ac77a01b6ade6184872ed3a..99d1919a2e62579f40f1d641099bbd7691c9f2f2 100644 |
--- a/celt/x86/pitch_sse.h |
+++ b/celt/x86/pitch_sse.h |
@@ -1,4 +1,5 @@ |
-/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ |
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges |
+ Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/ |
/** |
@file pitch_sse.h |
@brief Pitch analysis |
@@ -32,11 +33,56 @@ |
#ifndef PITCH_SSE_H |
#define PITCH_SSE_H |
+#if defined(HAVE_CONFIG_H) |
+#include "config.h" |
+#endif |
+ |
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) /* variants taking opus_int16 input, called through per-arch function-pointer tables */ |
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) /* the xcorr_kernel declarations and override below are skipped when only SSE2 is enabled */ |
+void xcorr_kernel_sse4_1( |
+ const opus_int16 *x, |
+ const opus_int16 *y, |
+ opus_val32 sum[4], |
+ int len); |
+/* Const table of OPUS_ARCHMASK + 1 xcorr kernel function pointers; only declared here, the definition lives elsewhere. */ |
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( |
+ const opus_int16 *x, |
+ const opus_int16 *y, |
+ opus_val32 sum[4], |
+ int len); |
+/* xcorr_kernel() calls the table entry at index (arch) & OPUS_ARCHMASK. */ |
+#define OVERRIDE_XCORR_KERNEL |
+#define xcorr_kernel(x, y, sum, len, arch) \ |
+ ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) |
+/* NOTE(review): no header declaring opus_int16, opus_val32 or OPUS_ARCHMASK is visibly included in this branch -- confirm the includer provides them. */ |
+opus_val32 celt_inner_prod_sse4_1( |
+ const opus_int16 *x, |
+ const opus_int16 *y, |
+ int N); |
+#endif /* OPUS_X86_MAY_HAVE_SSE4_1 */ |
+ |
+#if defined(OPUS_X86_MAY_HAVE_SSE2) |
+opus_val32 celt_inner_prod_sse2( |
+ const opus_int16 *x, |
+ const opus_int16 *y, |
+ int N); |
+#endif /* OPUS_X86_MAY_HAVE_SSE2 */ |
+/* Const table of OPUS_ARCHMASK + 1 inner-product function pointers; declared when either macro above is defined. */ |
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( |
+ const opus_int16 *x, |
+ const opus_int16 *y, |
+ int N); |
+/* celt_inner_prod() calls the table entry at index (arch) & OPUS_ARCHMASK. */ |
+#define OVERRIDE_CELT_INNER_PROD |
+#define celt_inner_prod(x, y, N, arch) \ |
+ ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) |
+#else /* neither OPUS_X86_MAY_HAVE_SSE4_1 nor OPUS_X86_MAY_HAVE_SSE2 is defined */ |
+ |
#include <xmmintrin.h> |
#include "arch.h" |
#define OVERRIDE_XCORR_KERNEL |
-static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) |
+static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) |
{ |
int j; |
__m128 xsum1, xsum2; |
@@ -71,6 +117,9 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, o |
_mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); |
} |
+#define xcorr_kernel(_x, _y, _z, len, arch) \ |
+ ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) /* arch is evaluated and discarded; this branch always calls xcorr_kernel_sse */ |
+ |
#define OVERRIDE_DUAL_INNER_PROD |
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, |
int N, opus_val32 *xy1, opus_val32 *xy2) |
@@ -101,6 +150,35 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y |
} |
} |
+#define OVERRIDE_CELT_INNER_PROD /* presumably tells the generic pitch code that celt_inner_prod is supplied here -- confirm */ |
+static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, |
+ int N) |
+{ /* Inner product of x and y over N elements: 4-lane SSE accumulation followed by a scalar tail. */ |
+ int i; |
+ float xy; /* scalar result; NOTE(review): x and y are handed to _mm_loadu_ps, so opus_val16 is presumably float here -- confirm */ |
+ __m128 sum; /* four running partial sums, one per lane */ |
+ sum = _mm_setzero_ps(); |
+ /* Consume four elements per iteration while at least four remain. FIXME: We should probably go 8-way and use 2 sums. */ |
+ for (i=0;i<N-3;i+=4) |
+ { |
+ __m128 xi = _mm_loadu_ps(x+i); /* unaligned load, so x+i needs no particular alignment */ |
+ __m128 yi = _mm_loadu_ps(y+i); |
+ sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); |
+ } |
+ /* Horizontal sum: add lanes 2,3 onto lanes 0,1, then add lane 1 onto lane 0 and store lane 0 into xy. */ |
+ sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); |
+ sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); /* 0x55 selects lane 1 for every output lane */ |
+ _mm_store_ss(&xy, sum); |
+ for (;i<N;i++) /* leftover elements (fewer than four) */ |
+ { |
+ xy = MAC16_16(xy, x[i], y[i]); /* MAC16_16 is defined elsewhere; presumably xy + x[i]*y[i] -- confirm */ |
+ } |
+ return xy; |
+} |
+/* In this branch celt_inner_prod() evaluates and discards arch, then always calls celt_inner_prod_sse. */ |
+# define celt_inner_prod(_x, _y, len, arch) \ |
+ ((void)(arch),celt_inner_prod_sse(_x, _y, len)) |
+ |
#define OVERRIDE_COMB_FILTER_CONST |
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, |
opus_val16 g10, opus_val16 g11, opus_val16 g12) |
@@ -154,3 +232,4 @@ static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, i |
} |
#endif |
+#endif |