Index: core/src/fxcodec/fx_libopenjpeg/libopenjpeg20/mct.c |
diff --git a/core/src/fxcodec/fx_libopenjpeg/libopenjpeg20/mct.c b/core/src/fxcodec/fx_libopenjpeg/libopenjpeg20/mct.c |
index 60ee0969d52921a303fbab86104e76571c75ef49..a0da0996604048f26e9740596eb8001280c7045c 100644 |
--- a/core/src/fxcodec/fx_libopenjpeg/libopenjpeg20/mct.c |
+++ b/core/src/fxcodec/fx_libopenjpeg/libopenjpeg20/mct.c |
@@ -40,6 +40,12 @@ |
#ifdef __SSE__ |
#include <xmmintrin.h> |
#endif |
+#ifdef __SSE2__ |
+#include <emmintrin.h> |
+#endif |
+#ifdef __SSE4_1__ |
+#include <smmintrin.h> |
+#endif |
#include "opj_includes.h" |
@@ -66,14 +72,33 @@ const OPJ_FLOAT64 * opj_mct_get_mct_norms_real () |
/* <summary> */
/* Forward reversible MCT. */
/* </summary> */
+#ifdef __SSE2__ |
void opj_mct_encode( |
OPJ_INT32* restrict c0, |
OPJ_INT32* restrict c1, |
OPJ_INT32* restrict c2, |
OPJ_UINT32 n) |
{ |
- OPJ_UINT32 i; |
- for(i = 0; i < n; ++i) { |
+ OPJ_SIZE_T i; |
+ const OPJ_SIZE_T len = n; |
+ |
+ for(i = 0; i < (len & ~3U); i += 4) { |
+ __m128i y, u, v; |
+ __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); |
+ __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); |
+ __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); |
+ y = _mm_add_epi32(g, g); |
+ y = _mm_add_epi32(y, b); |
+ y = _mm_add_epi32(y, r); |
+ y = _mm_srai_epi32(y, 2); |
+ u = _mm_sub_epi32(b, g); |
+ v = _mm_sub_epi32(r, g); |
+ _mm_store_si128((__m128i *)&(c0[i]), y); |
+ _mm_store_si128((__m128i *)&(c1[i]), u); |
+ _mm_store_si128((__m128i *)&(c2[i]), v); |
+ } |
+ |
+ for(; i < len; ++i) { |
OPJ_INT32 r = c0[i]; |
OPJ_INT32 g = c1[i]; |
OPJ_INT32 b = c2[i]; |
@@ -85,10 +110,69 @@ void opj_mct_encode( |
c2[i] = v; |
} |
} |
+#else |
+void opj_mct_encode( |
+ OPJ_INT32* restrict c0, |
+ OPJ_INT32* restrict c1, |
+ OPJ_INT32* restrict c2, |
+ OPJ_UINT32 n) |
+{ |
+ OPJ_SIZE_T i; |
+ const OPJ_SIZE_T len = n; |
+ |
+ for(i = 0; i < len; ++i) { |
+ OPJ_INT32 r = c0[i]; |
+ OPJ_INT32 g = c1[i]; |
+ OPJ_INT32 b = c2[i]; |
+ OPJ_INT32 y = (r + (g * 2) + b) >> 2; |
+ OPJ_INT32 u = b - g; |
+ OPJ_INT32 v = r - g; |
+ c0[i] = y; |
+ c1[i] = u; |
+ c2[i] = v; |
+ } |
+} |
+#endif |
/* <summary> */ |
/* Inverse reversible MCT. */ |
/* </summary> */ |
+#ifdef __SSE2__ |
+void opj_mct_decode( |
+ OPJ_INT32* restrict c0, |
+ OPJ_INT32* restrict c1, |
+ OPJ_INT32* restrict c2, |
+ OPJ_UINT32 n) |
+{ |
+ OPJ_SIZE_T i; |
+ const OPJ_SIZE_T len = n; |
+ |
+ for(i = 0; i < (len & ~3U); i += 4) { |
+ __m128i r, g, b; |
+ __m128i y = _mm_load_si128((const __m128i *)&(c0[i])); |
+ __m128i u = _mm_load_si128((const __m128i *)&(c1[i])); |
+ __m128i v = _mm_load_si128((const __m128i *)&(c2[i])); |
+ g = y; |
+ g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); |
+ r = _mm_add_epi32(v, g); |
+ b = _mm_add_epi32(u, g); |
+ _mm_store_si128((__m128i *)&(c0[i]), r); |
+ _mm_store_si128((__m128i *)&(c1[i]), g); |
+ _mm_store_si128((__m128i *)&(c2[i]), b); |
+ } |
+ for (; i < len; ++i) { |
+ OPJ_INT32 y = c0[i]; |
+ OPJ_INT32 u = c1[i]; |
+ OPJ_INT32 v = c2[i]; |
+ OPJ_INT32 g = y - ((u + v) >> 2); |
+ OPJ_INT32 r = v + g; |
+ OPJ_INT32 b = u + g; |
+ c0[i] = r; |
+ c1[i] = g; |
+ c2[i] = b; |
+ } |
+} |
+#else |
void opj_mct_decode( |
OPJ_INT32* restrict c0, |
OPJ_INT32* restrict c1, |
@@ -108,6 +192,7 @@ void opj_mct_decode( |
c2[i] = b; |
} |
} |
+#endif |
/* <summary> */ |
/* Get norm of basis function of reversible MCT. */ |
@@ -119,6 +204,148 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) { |
/* <summary> */
/* Forward irreversible MCT. */
/* </summary> */
+#ifdef __SSE4_1__ |
+void opj_mct_encode_real( |
+ OPJ_INT32* restrict c0, |
+ OPJ_INT32* restrict c1, |
+ OPJ_INT32* restrict c2, |
+ OPJ_UINT32 n) |
+{ |
+ OPJ_SIZE_T i; |
+ const OPJ_SIZE_T len = n; |
+ |
+ const __m128i ry = _mm_set1_epi32(2449); |
+ const __m128i gy = _mm_set1_epi32(4809); |
+ const __m128i by = _mm_set1_epi32(934); |
+ const __m128i ru = _mm_set1_epi32(1382); |
+ const __m128i gu = _mm_set1_epi32(2714); |
+ /* const __m128i bu = _mm_set1_epi32(4096); */ |
+ /* const __m128i rv = _mm_set1_epi32(4096); */ |
+ const __m128i gv = _mm_set1_epi32(3430); |
+ const __m128i bv = _mm_set1_epi32(666); |
+ const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), _MM_SHUFFLE(1, 0, 1, 0)); |
+ |
+ for(i = 0; i < (len & ~3U); i += 4) { |
+ __m128i lo, hi; |
+ __m128i y, u, v; |
+ __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); |
+ __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); |
+ __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); |
+ |
+ lo = r; |
+ hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, ry); |
+ hi = _mm_mul_epi32(hi, ry); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ y = _mm_blend_epi16(lo, hi, 0xCC); |
+ |
+ lo = g; |
+ hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, gy); |
+ hi = _mm_mul_epi32(hi, gy); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); |
+ |
+ lo = b; |
+ hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, by); |
+ hi = _mm_mul_epi32(hi, by); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); |
+ _mm_store_si128((__m128i *)&(c0[i]), y); |
+ |
+ /*lo = b; |
+ hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, mulround); |
+ hi = _mm_mul_epi32(hi, mulround);*/ |
+ lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); |
+ hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); |
+ lo = _mm_slli_epi64(lo, 12); |
+ hi = _mm_slli_epi64(hi, 12); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ u = _mm_blend_epi16(lo, hi, 0xCC); |
+ |
+ lo = r; |
+ hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, ru); |
+ hi = _mm_mul_epi32(hi, ru); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); |
+ |
+ lo = g; |
+ hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, gu); |
+ hi = _mm_mul_epi32(hi, gu); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); |
+ _mm_store_si128((__m128i *)&(c1[i]), u); |
+ |
+ /*lo = r; |
+ hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, mulround); |
+ hi = _mm_mul_epi32(hi, mulround);*/ |
+ lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); |
+ hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); |
+ lo = _mm_slli_epi64(lo, 12); |
+ hi = _mm_slli_epi64(hi, 12); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ v = _mm_blend_epi16(lo, hi, 0xCC); |
+ |
+ lo = g; |
+ hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, gv); |
+ hi = _mm_mul_epi32(hi, gv); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); |
+ |
+ lo = b; |
+ hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); |
+ lo = _mm_mul_epi32(lo, bv); |
+ hi = _mm_mul_epi32(hi, bv); |
+ lo = _mm_add_epi64(lo, mulround); |
+ hi = _mm_add_epi64(hi, mulround); |
+ lo = _mm_srli_epi64(lo, 13); |
+ hi = _mm_slli_epi64(hi, 32-13); |
+ v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); |
+ _mm_store_si128((__m128i *)&(c2[i]), v); |
+ } |
+ for(; i < len; ++i) { |
+ OPJ_INT32 r = c0[i]; |
+ OPJ_INT32 g = c1[i]; |
+ OPJ_INT32 b = c2[i]; |
+ OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, 4809) + opj_int_fix_mul(b, 934); |
+ OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, 2714) + opj_int_fix_mul(b, 4096); |
+ OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, 3430) - opj_int_fix_mul(b, 666); |
+ c0[i] = y; |
+ c1[i] = u; |
+ c2[i] = v; |
+ } |
+} |
+#else |
void opj_mct_encode_real( |
OPJ_INT32* restrict c0, |
OPJ_INT32* restrict c1, |
@@ -138,6 +365,7 @@ void opj_mct_encode_real( |
c2[i] = v; |
} |
} |
+#endif |
/* <summary> */ |
/* Inverse irreversible MCT. */ |