Index: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c |
diff --git a/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c |
index 42e0baa05b442bf5d0a4e3d717009b9b45671670..0385c7955c902bf09ad89117e8f20b696855acdf 100644 |
--- a/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c |
+++ b/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c |
@@ -17,17 +17,16 @@ |
d0 = _mm_unpacklo_epi8(d0, zero); \ |
d0 = _mm_add_epi16(in_x, d0); \ |
d0 = _mm_packus_epi16(d0, d0); \ |
- *(int *)dest = _mm_cvtsi128_si32(d0); \ |
- dest += stride; \ |
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \ |
} |
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
const __m128i zero = _mm_setzero_si128(); |
const __m128i eight = _mm_set1_epi16(8); |
- const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, |
- (int16_t)cospi_16_64, (int16_t)-cospi_16_64, |
- (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
- (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
+ const __m128i cst = _mm_setr_epi16( |
+ (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, |
+ (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, |
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64); |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
__m128i input0, input1, input2, input3; |
@@ -126,28 +125,28 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
// Reconstruction and Store |
{ |
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
- d0 = _mm_unpacklo_epi32(d0, |
- _mm_cvtsi32_si128(*(const int *) (dest + stride))); |
- d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( |
- *(const int *) (dest + stride * 3)), d2); |
- d0 = _mm_unpacklo_epi8(d0, zero); |
- d2 = _mm_unpacklo_epi8(d2, zero); |
- d0 = _mm_add_epi16(d0, input2); |
- d2 = _mm_add_epi16(d2, input3); |
- d0 = _mm_packus_epi16(d0, d2); |
- // store input0 |
- *(int *)dest = _mm_cvtsi128_si32(d0); |
- // store input1 |
- d0 = _mm_srli_si128(d0, 4); |
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
- // store input2 |
- d0 = _mm_srli_si128(d0, 4); |
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
- // store input3 |
- d0 = _mm_srli_si128(d0, 4); |
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
+ d0 = _mm_unpacklo_epi32(d0, |
+ _mm_cvtsi32_si128(*(const int *)(dest + stride))); |
+ d2 = _mm_unpacklo_epi32( |
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); |
+ d0 = _mm_unpacklo_epi8(d0, zero); |
+ d2 = _mm_unpacklo_epi8(d2, zero); |
+ d0 = _mm_add_epi16(d0, input2); |
+ d2 = _mm_add_epi16(d2, input3); |
+ d0 = _mm_packus_epi16(d0, d2); |
+ // store input0 |
+ *(int *)dest = _mm_cvtsi128_si32(d0); |
+ // store input1 |
+ d0 = _mm_srli_si128(d0, 4); |
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
+ // store input2 |
+ d0 = _mm_srli_si128(d0, 4); |
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
+ // store input3 |
+ d0 = _mm_srli_si128(d0, 4); |
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
} |
} |
@@ -162,10 +161,10 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
dc_value = _mm_set1_epi16(a); |
- RECON_AND_STORE4X4(dest, dc_value); |
- RECON_AND_STORE4X4(dest, dc_value); |
- RECON_AND_STORE4X4(dest, dc_value); |
- RECON_AND_STORE4X4(dest, dc_value); |
+ RECON_AND_STORE4X4(dest + 0 * stride, dc_value); |
+ RECON_AND_STORE4X4(dest + 1 * stride, dc_value); |
+ RECON_AND_STORE4X4(dest + 2 * stride, dc_value); |
+ RECON_AND_STORE4X4(dest + 3 * stride, dc_value); |
} |
static INLINE void transpose_4x4(__m128i *res) { |
@@ -267,8 +266,8 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
const __m128i zero = _mm_setzero_si128(); |
const __m128i eight = _mm_set1_epi16(8); |
- in[0]= _mm_loadu_si128((const __m128i *)(input)); |
- in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); |
+ in[0] = _mm_loadu_si128((const __m128i *)(input)); |
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); |
switch (tx_type) { |
case 0: // DCT_DCT |
@@ -301,28 +300,28 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
// Reconstruction and Store |
{ |
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
- d0 = _mm_unpacklo_epi32(d0, |
- _mm_cvtsi32_si128(*(const int *) (dest + stride))); |
- d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( |
- *(const int *) (dest + stride * 3))); |
- d0 = _mm_unpacklo_epi8(d0, zero); |
- d2 = _mm_unpacklo_epi8(d2, zero); |
- d0 = _mm_add_epi16(d0, in[0]); |
- d2 = _mm_add_epi16(d2, in[1]); |
- d0 = _mm_packus_epi16(d0, d2); |
- // store result[0] |
- *(int *)dest = _mm_cvtsi128_si32(d0); |
- // store result[1] |
- d0 = _mm_srli_si128(d0, 4); |
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
- // store result[2] |
- d0 = _mm_srli_si128(d0, 4); |
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
- // store result[3] |
- d0 = _mm_srli_si128(d0, 4); |
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); |
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); |
+ d0 = _mm_unpacklo_epi32(d0, |
+ _mm_cvtsi32_si128(*(const int *)(dest + stride))); |
+ d2 = _mm_unpacklo_epi32( |
+ d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); |
+ d0 = _mm_unpacklo_epi8(d0, zero); |
+ d2 = _mm_unpacklo_epi8(d2, zero); |
+ d0 = _mm_add_epi16(d0, in[0]); |
+ d2 = _mm_add_epi16(d2, in[1]); |
+ d0 = _mm_packus_epi16(d0, d2); |
+ // store result[0] |
+ *(int *)dest = _mm_cvtsi128_si32(d0); |
+ // store result[1] |
+ d0 = _mm_srli_si128(d0, 4); |
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); |
+ // store result[2] |
+ d0 = _mm_srli_si128(d0, 4); |
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); |
+ // store result[3] |
+ d0 = _mm_srli_si128(d0, 4); |
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); |
} |
} |
@@ -517,7 +516,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
const __m128i zero = _mm_setzero_si128(); |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
- const __m128i final_rounding = _mm_set1_epi16(1<<4); |
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
@@ -551,7 +550,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
// 4-stage 1D idct8x8 |
IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
- in0, in1, in2, in3, in4, in5, in6, in7); |
+ in0, in1, in2, in3, in4, in5, in6, in7); |
} |
// Final rounding and shift |
@@ -573,14 +572,14 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
in6 = _mm_srai_epi16(in6, 5); |
in7 = _mm_srai_epi16(in7, 5); |
- RECON_AND_STORE(dest, in0); |
- RECON_AND_STORE(dest, in1); |
- RECON_AND_STORE(dest, in2); |
- RECON_AND_STORE(dest, in3); |
- RECON_AND_STORE(dest, in4); |
- RECON_AND_STORE(dest, in5); |
- RECON_AND_STORE(dest, in6); |
- RECON_AND_STORE(dest, in7); |
+ RECON_AND_STORE(dest + 0 * stride, in0); |
+ RECON_AND_STORE(dest + 1 * stride, in1); |
+ RECON_AND_STORE(dest + 2 * stride, in2); |
+ RECON_AND_STORE(dest + 3 * stride, in3); |
+ RECON_AND_STORE(dest + 4 * stride, in4); |
+ RECON_AND_STORE(dest + 5 * stride, in5); |
+ RECON_AND_STORE(dest + 6 * stride, in6); |
+ RECON_AND_STORE(dest + 7 * stride, in7); |
} |
void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
@@ -594,14 +593,14 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
dc_value = _mm_set1_epi16(a); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
+ RECON_AND_STORE(dest + 0 * stride, dc_value); |
+ RECON_AND_STORE(dest + 1 * stride, dc_value); |
+ RECON_AND_STORE(dest + 2 * stride, dc_value); |
+ RECON_AND_STORE(dest + 3 * stride, dc_value); |
+ RECON_AND_STORE(dest + 4 * stride, dc_value); |
+ RECON_AND_STORE(dest + 5 * stride, dc_value); |
+ RECON_AND_STORE(dest + 6 * stride, dc_value); |
+ RECON_AND_STORE(dest + 7 * stride, dc_value); |
} |
static void idct8_sse2(__m128i *in) { |
@@ -626,7 +625,7 @@ static void idct8_sse2(__m128i *in) { |
// 4-stage 1D idct8x8 |
IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, |
- in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); |
+ in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); |
} |
static void iadst8_sse2(__m128i *in) { |
@@ -656,14 +655,14 @@ static void iadst8_sse2(__m128i *in) { |
array_transpose_8x8(in, in); |
// properly aligned for butterfly input |
- in0 = in[7]; |
- in1 = in[0]; |
- in2 = in[5]; |
- in3 = in[2]; |
- in4 = in[3]; |
- in5 = in[4]; |
- in6 = in[1]; |
- in7 = in[6]; |
+ in0 = in[7]; |
+ in1 = in[0]; |
+ in2 = in[5]; |
+ in3 = in[2]; |
+ in4 = in[3]; |
+ in5 = in[4]; |
+ in6 = in[1]; |
+ in7 = in[6]; |
// column transformation |
// stage 1 |
@@ -857,12 +856,11 @@ static void iadst8_sse2(__m128i *in) { |
in[7] = _mm_sub_epi16(k__const_0, s1); |
} |
- |
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
int tx_type) { |
__m128i in[8]; |
const __m128i zero = _mm_setzero_si128(); |
- const __m128i final_rounding = _mm_set1_epi16(1<<4); |
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
// load input data |
in[0] = _mm_load_si128((const __m128i *)input); |
@@ -915,20 +913,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
in[6] = _mm_srai_epi16(in[6], 5); |
in[7] = _mm_srai_epi16(in[7], 5); |
- RECON_AND_STORE(dest, in[0]); |
- RECON_AND_STORE(dest, in[1]); |
- RECON_AND_STORE(dest, in[2]); |
- RECON_AND_STORE(dest, in[3]); |
- RECON_AND_STORE(dest, in[4]); |
- RECON_AND_STORE(dest, in[5]); |
- RECON_AND_STORE(dest, in[6]); |
- RECON_AND_STORE(dest, in[7]); |
+ RECON_AND_STORE(dest + 0 * stride, in[0]); |
+ RECON_AND_STORE(dest + 1 * stride, in[1]); |
+ RECON_AND_STORE(dest + 2 * stride, in[2]); |
+ RECON_AND_STORE(dest + 3 * stride, in[3]); |
+ RECON_AND_STORE(dest + 4 * stride, in[4]); |
+ RECON_AND_STORE(dest + 5 * stride, in[5]); |
+ RECON_AND_STORE(dest + 6 * stride, in[6]); |
+ RECON_AND_STORE(dest + 7 * stride, in[7]); |
} |
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
const __m128i zero = _mm_setzero_si128(); |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
- const __m128i final_rounding = _mm_set1_epi16(1<<4); |
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4); |
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
@@ -953,7 +951,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
// 8x4 Transpose |
TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); |
// Stage1 |
- { //NOLINT |
+ { |
const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); |
const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); |
@@ -976,7 +974,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
} |
// Stage2 |
- { //NOLINT |
+ { |
const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); |
const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); |
@@ -1006,7 +1004,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
} |
// Stage3 |
- { //NOLINT |
+ { |
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); |
tmp4 = _mm_adds_epi16(stp2_0, stp2_2); |
@@ -1035,7 +1033,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) |
IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, |
- in0, in1, in2, in3, in4, in5, in6, in7); |
+ in0, in1, in2, in3, in4, in5, in6, in7); |
// Final rounding and shift |
in0 = _mm_adds_epi16(in0, final_rounding); |
in1 = _mm_adds_epi16(in1, final_rounding); |
@@ -1055,14 +1053,14 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
in6 = _mm_srai_epi16(in6, 5); |
in7 = _mm_srai_epi16(in7, 5); |
- RECON_AND_STORE(dest, in0); |
- RECON_AND_STORE(dest, in1); |
- RECON_AND_STORE(dest, in2); |
- RECON_AND_STORE(dest, in3); |
- RECON_AND_STORE(dest, in4); |
- RECON_AND_STORE(dest, in5); |
- RECON_AND_STORE(dest, in6); |
- RECON_AND_STORE(dest, in7); |
+ RECON_AND_STORE(dest + 0 * stride, in0); |
+ RECON_AND_STORE(dest + 1 * stride, in1); |
+ RECON_AND_STORE(dest + 2 * stride, in2); |
+ RECON_AND_STORE(dest + 3 * stride, in3); |
+ RECON_AND_STORE(dest + 4 * stride, in4); |
+ RECON_AND_STORE(dest + 5 * stride, in5); |
+ RECON_AND_STORE(dest + 6 * stride, in6); |
+ RECON_AND_STORE(dest + 7 * stride, in7); |
} |
#define IDCT16 \ |
@@ -1305,7 +1303,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
int stride) { |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
- const __m128i final_rounding = _mm_set1_epi16(1<<5); |
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
const __m128i zero = _mm_setzero_si128(); |
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
@@ -1344,130 +1342,86 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
curr1 = l; |
for (i = 0; i < 2; i++) { |
- // 1-D idct |
- |
- // Load input data. |
- in[0] = _mm_load_si128((const __m128i *)input); |
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); |
- in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); |
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); |
- in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); |
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); |
- in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); |
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); |
- in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); |
- |
- array_transpose_8x8(in, in); |
- array_transpose_8x8(in+8, in+8); |
- |
- IDCT16 |
- |
- // Stage7 |
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15); |
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14); |
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13); |
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12); |
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11); |
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10); |
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9); |
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8); |
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); |
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); |
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); |
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); |
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); |
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); |
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); |
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); |
- |
- curr1 = r; |
- input += 128; |
+ // 1-D idct |
+ |
+ // Load input data. |
+ in[0] = _mm_load_si128((const __m128i *)input); |
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); |
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); |
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); |
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); |
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); |
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); |
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); |
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); |
+ |
+ array_transpose_8x8(in, in); |
+ array_transpose_8x8(in + 8, in + 8); |
+ |
+ IDCT16 |
+ |
+ // Stage7 |
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15); |
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14); |
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13); |
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12); |
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11); |
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10); |
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9); |
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8); |
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); |
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); |
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); |
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); |
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); |
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); |
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); |
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); |
+ |
+ curr1 = r; |
+ input += 128; |
} |
for (i = 0; i < 2; i++) { |
- // 1-D idct |
- array_transpose_8x8(l+i*8, in); |
- array_transpose_8x8(r+i*8, in+8); |
- |
- IDCT16 |
- |
- // 2-D |
- in[0] = _mm_add_epi16(stp2_0, stp1_15); |
- in[1] = _mm_add_epi16(stp2_1, stp1_14); |
- in[2] = _mm_add_epi16(stp2_2, stp2_13); |
- in[3] = _mm_add_epi16(stp2_3, stp2_12); |
- in[4] = _mm_add_epi16(stp2_4, stp2_11); |
- in[5] = _mm_add_epi16(stp2_5, stp2_10); |
- in[6] = _mm_add_epi16(stp2_6, stp1_9); |
- in[7] = _mm_add_epi16(stp2_7, stp1_8); |
- in[8] = _mm_sub_epi16(stp2_7, stp1_8); |
- in[9] = _mm_sub_epi16(stp2_6, stp1_9); |
- in[10] = _mm_sub_epi16(stp2_5, stp2_10); |
- in[11] = _mm_sub_epi16(stp2_4, stp2_11); |
- in[12] = _mm_sub_epi16(stp2_3, stp2_12); |
- in[13] = _mm_sub_epi16(stp2_2, stp2_13); |
- in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
- in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
+ int j; |
+ // 1-D idct |
+ array_transpose_8x8(l + i * 8, in); |
+ array_transpose_8x8(r + i * 8, in + 8); |
+ |
+ IDCT16 |
+ // 2-D |
+ in[0] = _mm_add_epi16(stp2_0, stp1_15); |
+ in[1] = _mm_add_epi16(stp2_1, stp1_14); |
+ in[2] = _mm_add_epi16(stp2_2, stp2_13); |
+ in[3] = _mm_add_epi16(stp2_3, stp2_12); |
+ in[4] = _mm_add_epi16(stp2_4, stp2_11); |
+ in[5] = _mm_add_epi16(stp2_5, stp2_10); |
+ in[6] = _mm_add_epi16(stp2_6, stp1_9); |
+ in[7] = _mm_add_epi16(stp2_7, stp1_8); |
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8); |
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9); |
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10); |
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11); |
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12); |
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13); |
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
+ |
+ for (j = 0; j < 16; ++j) { |
// Final rounding and shift |
- in[0] = _mm_adds_epi16(in[0], final_rounding); |
- in[1] = _mm_adds_epi16(in[1], final_rounding); |
- in[2] = _mm_adds_epi16(in[2], final_rounding); |
- in[3] = _mm_adds_epi16(in[3], final_rounding); |
- in[4] = _mm_adds_epi16(in[4], final_rounding); |
- in[5] = _mm_adds_epi16(in[5], final_rounding); |
- in[6] = _mm_adds_epi16(in[6], final_rounding); |
- in[7] = _mm_adds_epi16(in[7], final_rounding); |
- in[8] = _mm_adds_epi16(in[8], final_rounding); |
- in[9] = _mm_adds_epi16(in[9], final_rounding); |
- in[10] = _mm_adds_epi16(in[10], final_rounding); |
- in[11] = _mm_adds_epi16(in[11], final_rounding); |
- in[12] = _mm_adds_epi16(in[12], final_rounding); |
- in[13] = _mm_adds_epi16(in[13], final_rounding); |
- in[14] = _mm_adds_epi16(in[14], final_rounding); |
- in[15] = _mm_adds_epi16(in[15], final_rounding); |
- |
- in[0] = _mm_srai_epi16(in[0], 6); |
- in[1] = _mm_srai_epi16(in[1], 6); |
- in[2] = _mm_srai_epi16(in[2], 6); |
- in[3] = _mm_srai_epi16(in[3], 6); |
- in[4] = _mm_srai_epi16(in[4], 6); |
- in[5] = _mm_srai_epi16(in[5], 6); |
- in[6] = _mm_srai_epi16(in[6], 6); |
- in[7] = _mm_srai_epi16(in[7], 6); |
- in[8] = _mm_srai_epi16(in[8], 6); |
- in[9] = _mm_srai_epi16(in[9], 6); |
- in[10] = _mm_srai_epi16(in[10], 6); |
- in[11] = _mm_srai_epi16(in[11], 6); |
- in[12] = _mm_srai_epi16(in[12], 6); |
- in[13] = _mm_srai_epi16(in[13], 6); |
- in[14] = _mm_srai_epi16(in[14], 6); |
- in[15] = _mm_srai_epi16(in[15], 6); |
- |
- RECON_AND_STORE(dest, in[0]); |
- RECON_AND_STORE(dest, in[1]); |
- RECON_AND_STORE(dest, in[2]); |
- RECON_AND_STORE(dest, in[3]); |
- RECON_AND_STORE(dest, in[4]); |
- RECON_AND_STORE(dest, in[5]); |
- RECON_AND_STORE(dest, in[6]); |
- RECON_AND_STORE(dest, in[7]); |
- RECON_AND_STORE(dest, in[8]); |
- RECON_AND_STORE(dest, in[9]); |
- RECON_AND_STORE(dest, in[10]); |
- RECON_AND_STORE(dest, in[11]); |
- RECON_AND_STORE(dest, in[12]); |
- RECON_AND_STORE(dest, in[13]); |
- RECON_AND_STORE(dest, in[14]); |
- RECON_AND_STORE(dest, in[15]); |
- |
- dest += 8 - (stride * 16); |
+ in[j] = _mm_adds_epi16(in[j], final_rounding); |
+ in[j] = _mm_srai_epi16(in[j], 6); |
+ RECON_AND_STORE(dest + j * stride, in[j]); |
+ } |
+ |
+ dest += 8; |
} |
} |
@@ -1483,23 +1437,23 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
dc_value = _mm_set1_epi16(a); |
for (i = 0; i < 2; ++i) { |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- dest += 8 - (stride * 16); |
+ RECON_AND_STORE(dest + 0 * stride, dc_value); |
+ RECON_AND_STORE(dest + 1 * stride, dc_value); |
+ RECON_AND_STORE(dest + 2 * stride, dc_value); |
+ RECON_AND_STORE(dest + 3 * stride, dc_value); |
+ RECON_AND_STORE(dest + 4 * stride, dc_value); |
+ RECON_AND_STORE(dest + 5 * stride, dc_value); |
+ RECON_AND_STORE(dest + 6 * stride, dc_value); |
+ RECON_AND_STORE(dest + 7 * stride, dc_value); |
+ RECON_AND_STORE(dest + 8 * stride, dc_value); |
+ RECON_AND_STORE(dest + 9 * stride, dc_value); |
+ RECON_AND_STORE(dest + 10 * stride, dc_value); |
+ RECON_AND_STORE(dest + 11 * stride, dc_value); |
+ RECON_AND_STORE(dest + 12 * stride, dc_value); |
+ RECON_AND_STORE(dest + 13 * stride, dc_value); |
+ RECON_AND_STORE(dest + 14 * stride, dc_value); |
+ RECON_AND_STORE(dest + 15 * stride, dc_value); |
+ dest += 8; |
} |
} |
@@ -2367,7 +2321,7 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
int stride) { |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
- const __m128i final_rounding = _mm_set1_epi16(1<<5); |
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
const __m128i zero = _mm_setzero_si128(); |
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
@@ -2406,7 +2360,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
// Stage2 |
{ |
const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); |
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); |
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); |
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); |
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); |
@@ -2567,7 +2521,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
// Second 1-D inverse transform, performed per 8x16 block |
for (i = 0; i < 2; i++) { |
- array_transpose_4X8(l + 8*i, in); |
+ int j; |
+ array_transpose_4X8(l + 8 * i, in); |
IDCT16_10 |
@@ -2589,59 +2544,14 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
in[14] = _mm_sub_epi16(stp2_1, stp1_14); |
in[15] = _mm_sub_epi16(stp2_0, stp1_15); |
- // Final rounding and shift |
- in[0] = _mm_adds_epi16(in[0], final_rounding); |
- in[1] = _mm_adds_epi16(in[1], final_rounding); |
- in[2] = _mm_adds_epi16(in[2], final_rounding); |
- in[3] = _mm_adds_epi16(in[3], final_rounding); |
- in[4] = _mm_adds_epi16(in[4], final_rounding); |
- in[5] = _mm_adds_epi16(in[5], final_rounding); |
- in[6] = _mm_adds_epi16(in[6], final_rounding); |
- in[7] = _mm_adds_epi16(in[7], final_rounding); |
- in[8] = _mm_adds_epi16(in[8], final_rounding); |
- in[9] = _mm_adds_epi16(in[9], final_rounding); |
- in[10] = _mm_adds_epi16(in[10], final_rounding); |
- in[11] = _mm_adds_epi16(in[11], final_rounding); |
- in[12] = _mm_adds_epi16(in[12], final_rounding); |
- in[13] = _mm_adds_epi16(in[13], final_rounding); |
- in[14] = _mm_adds_epi16(in[14], final_rounding); |
- in[15] = _mm_adds_epi16(in[15], final_rounding); |
- |
- in[0] = _mm_srai_epi16(in[0], 6); |
- in[1] = _mm_srai_epi16(in[1], 6); |
- in[2] = _mm_srai_epi16(in[2], 6); |
- in[3] = _mm_srai_epi16(in[3], 6); |
- in[4] = _mm_srai_epi16(in[4], 6); |
- in[5] = _mm_srai_epi16(in[5], 6); |
- in[6] = _mm_srai_epi16(in[6], 6); |
- in[7] = _mm_srai_epi16(in[7], 6); |
- in[8] = _mm_srai_epi16(in[8], 6); |
- in[9] = _mm_srai_epi16(in[9], 6); |
- in[10] = _mm_srai_epi16(in[10], 6); |
- in[11] = _mm_srai_epi16(in[11], 6); |
- in[12] = _mm_srai_epi16(in[12], 6); |
- in[13] = _mm_srai_epi16(in[13], 6); |
- in[14] = _mm_srai_epi16(in[14], 6); |
- in[15] = _mm_srai_epi16(in[15], 6); |
- |
- RECON_AND_STORE(dest, in[0]); |
- RECON_AND_STORE(dest, in[1]); |
- RECON_AND_STORE(dest, in[2]); |
- RECON_AND_STORE(dest, in[3]); |
- RECON_AND_STORE(dest, in[4]); |
- RECON_AND_STORE(dest, in[5]); |
- RECON_AND_STORE(dest, in[6]); |
- RECON_AND_STORE(dest, in[7]); |
- RECON_AND_STORE(dest, in[8]); |
- RECON_AND_STORE(dest, in[9]); |
- RECON_AND_STORE(dest, in[10]); |
- RECON_AND_STORE(dest, in[11]); |
- RECON_AND_STORE(dest, in[12]); |
- RECON_AND_STORE(dest, in[13]); |
- RECON_AND_STORE(dest, in[14]); |
- RECON_AND_STORE(dest, in[15]); |
- |
- dest += 8 - (stride * 16); |
+ for (j = 0; j < 16; ++j) { |
+ // Final rounding and shift |
+ in[j] = _mm_adds_epi16(in[j], final_rounding); |
+ in[j] = _mm_srai_epi16(in[j], 6); |
+ RECON_AND_STORE(dest + j * stride, in[j]); |
+ } |
+ |
+ dest += 8; |
} |
} |
@@ -3286,7 +3196,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
// Only upper-left 8x8 has non-zero coeff |
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
- int stride) { |
+ int stride) { |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
const __m128i final_rounding = _mm_set1_epi16(1<<5); |
@@ -3387,9 +3297,9 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
LOAD_DQCOEFF(in[31], input); |
array_transpose_8x8(in, in); |
- array_transpose_8x8(in+8, in+8); |
- array_transpose_8x8(in+16, in+16); |
- array_transpose_8x8(in+24, in+24); |
+ array_transpose_8x8(in + 8, in + 8); |
+ array_transpose_8x8(in + 16, in + 16); |
+ array_transpose_8x8(in + 24, in + 24); |
IDCT32 |
@@ -3427,153 +3337,61 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
col[30] = _mm_sub_epi16(stp1_1, stp1_30); |
col[31] = _mm_sub_epi16(stp1_0, stp1_31); |
for (i = 0; i < 4; i++) { |
- const __m128i zero = _mm_setzero_si128(); |
- // Transpose 32x8 block to 8x32 block |
- array_transpose_8x8(col+i*8, in); |
- IDCT32_34 |
- |
- // 2_D: Calculate the results and store them to destination. |
- in[0] = _mm_add_epi16(stp1_0, stp1_31); |
- in[1] = _mm_add_epi16(stp1_1, stp1_30); |
- in[2] = _mm_add_epi16(stp1_2, stp1_29); |
- in[3] = _mm_add_epi16(stp1_3, stp1_28); |
- in[4] = _mm_add_epi16(stp1_4, stp1_27); |
- in[5] = _mm_add_epi16(stp1_5, stp1_26); |
- in[6] = _mm_add_epi16(stp1_6, stp1_25); |
- in[7] = _mm_add_epi16(stp1_7, stp1_24); |
- in[8] = _mm_add_epi16(stp1_8, stp1_23); |
- in[9] = _mm_add_epi16(stp1_9, stp1_22); |
- in[10] = _mm_add_epi16(stp1_10, stp1_21); |
- in[11] = _mm_add_epi16(stp1_11, stp1_20); |
- in[12] = _mm_add_epi16(stp1_12, stp1_19); |
- in[13] = _mm_add_epi16(stp1_13, stp1_18); |
- in[14] = _mm_add_epi16(stp1_14, stp1_17); |
- in[15] = _mm_add_epi16(stp1_15, stp1_16); |
- in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
- in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
- in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
- in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
- in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
- in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
- in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
- in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
- in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
- in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
- in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
- in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
- in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
- in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
- in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
- in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
- |
+ int j; |
+ const __m128i zero = _mm_setzero_si128(); |
+ // Transpose 32x8 block to 8x32 block |
+ array_transpose_8x8(col + i * 8, in); |
+ IDCT32_34 |
+ |
+ // 2_D: Calculate the results and store them to destination. |
+ in[0] = _mm_add_epi16(stp1_0, stp1_31); |
+ in[1] = _mm_add_epi16(stp1_1, stp1_30); |
+ in[2] = _mm_add_epi16(stp1_2, stp1_29); |
+ in[3] = _mm_add_epi16(stp1_3, stp1_28); |
+ in[4] = _mm_add_epi16(stp1_4, stp1_27); |
+ in[5] = _mm_add_epi16(stp1_5, stp1_26); |
+ in[6] = _mm_add_epi16(stp1_6, stp1_25); |
+ in[7] = _mm_add_epi16(stp1_7, stp1_24); |
+ in[8] = _mm_add_epi16(stp1_8, stp1_23); |
+ in[9] = _mm_add_epi16(stp1_9, stp1_22); |
+ in[10] = _mm_add_epi16(stp1_10, stp1_21); |
+ in[11] = _mm_add_epi16(stp1_11, stp1_20); |
+ in[12] = _mm_add_epi16(stp1_12, stp1_19); |
+ in[13] = _mm_add_epi16(stp1_13, stp1_18); |
+ in[14] = _mm_add_epi16(stp1_14, stp1_17); |
+ in[15] = _mm_add_epi16(stp1_15, stp1_16); |
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
+ |
+ for (j = 0; j < 32; ++j) { |
// Final rounding and shift |
- in[0] = _mm_adds_epi16(in[0], final_rounding); |
- in[1] = _mm_adds_epi16(in[1], final_rounding); |
- in[2] = _mm_adds_epi16(in[2], final_rounding); |
- in[3] = _mm_adds_epi16(in[3], final_rounding); |
- in[4] = _mm_adds_epi16(in[4], final_rounding); |
- in[5] = _mm_adds_epi16(in[5], final_rounding); |
- in[6] = _mm_adds_epi16(in[6], final_rounding); |
- in[7] = _mm_adds_epi16(in[7], final_rounding); |
- in[8] = _mm_adds_epi16(in[8], final_rounding); |
- in[9] = _mm_adds_epi16(in[9], final_rounding); |
- in[10] = _mm_adds_epi16(in[10], final_rounding); |
- in[11] = _mm_adds_epi16(in[11], final_rounding); |
- in[12] = _mm_adds_epi16(in[12], final_rounding); |
- in[13] = _mm_adds_epi16(in[13], final_rounding); |
- in[14] = _mm_adds_epi16(in[14], final_rounding); |
- in[15] = _mm_adds_epi16(in[15], final_rounding); |
- in[16] = _mm_adds_epi16(in[16], final_rounding); |
- in[17] = _mm_adds_epi16(in[17], final_rounding); |
- in[18] = _mm_adds_epi16(in[18], final_rounding); |
- in[19] = _mm_adds_epi16(in[19], final_rounding); |
- in[20] = _mm_adds_epi16(in[20], final_rounding); |
- in[21] = _mm_adds_epi16(in[21], final_rounding); |
- in[22] = _mm_adds_epi16(in[22], final_rounding); |
- in[23] = _mm_adds_epi16(in[23], final_rounding); |
- in[24] = _mm_adds_epi16(in[24], final_rounding); |
- in[25] = _mm_adds_epi16(in[25], final_rounding); |
- in[26] = _mm_adds_epi16(in[26], final_rounding); |
- in[27] = _mm_adds_epi16(in[27], final_rounding); |
- in[28] = _mm_adds_epi16(in[28], final_rounding); |
- in[29] = _mm_adds_epi16(in[29], final_rounding); |
- in[30] = _mm_adds_epi16(in[30], final_rounding); |
- in[31] = _mm_adds_epi16(in[31], final_rounding); |
- |
- in[0] = _mm_srai_epi16(in[0], 6); |
- in[1] = _mm_srai_epi16(in[1], 6); |
- in[2] = _mm_srai_epi16(in[2], 6); |
- in[3] = _mm_srai_epi16(in[3], 6); |
- in[4] = _mm_srai_epi16(in[4], 6); |
- in[5] = _mm_srai_epi16(in[5], 6); |
- in[6] = _mm_srai_epi16(in[6], 6); |
- in[7] = _mm_srai_epi16(in[7], 6); |
- in[8] = _mm_srai_epi16(in[8], 6); |
- in[9] = _mm_srai_epi16(in[9], 6); |
- in[10] = _mm_srai_epi16(in[10], 6); |
- in[11] = _mm_srai_epi16(in[11], 6); |
- in[12] = _mm_srai_epi16(in[12], 6); |
- in[13] = _mm_srai_epi16(in[13], 6); |
- in[14] = _mm_srai_epi16(in[14], 6); |
- in[15] = _mm_srai_epi16(in[15], 6); |
- in[16] = _mm_srai_epi16(in[16], 6); |
- in[17] = _mm_srai_epi16(in[17], 6); |
- in[18] = _mm_srai_epi16(in[18], 6); |
- in[19] = _mm_srai_epi16(in[19], 6); |
- in[20] = _mm_srai_epi16(in[20], 6); |
- in[21] = _mm_srai_epi16(in[21], 6); |
- in[22] = _mm_srai_epi16(in[22], 6); |
- in[23] = _mm_srai_epi16(in[23], 6); |
- in[24] = _mm_srai_epi16(in[24], 6); |
- in[25] = _mm_srai_epi16(in[25], 6); |
- in[26] = _mm_srai_epi16(in[26], 6); |
- in[27] = _mm_srai_epi16(in[27], 6); |
- in[28] = _mm_srai_epi16(in[28], 6); |
- in[29] = _mm_srai_epi16(in[29], 6); |
- in[30] = _mm_srai_epi16(in[30], 6); |
- in[31] = _mm_srai_epi16(in[31], 6); |
- |
- RECON_AND_STORE(dest, in[0]); |
- RECON_AND_STORE(dest, in[1]); |
- RECON_AND_STORE(dest, in[2]); |
- RECON_AND_STORE(dest, in[3]); |
- RECON_AND_STORE(dest, in[4]); |
- RECON_AND_STORE(dest, in[5]); |
- RECON_AND_STORE(dest, in[6]); |
- RECON_AND_STORE(dest, in[7]); |
- RECON_AND_STORE(dest, in[8]); |
- RECON_AND_STORE(dest, in[9]); |
- RECON_AND_STORE(dest, in[10]); |
- RECON_AND_STORE(dest, in[11]); |
- RECON_AND_STORE(dest, in[12]); |
- RECON_AND_STORE(dest, in[13]); |
- RECON_AND_STORE(dest, in[14]); |
- RECON_AND_STORE(dest, in[15]); |
- RECON_AND_STORE(dest, in[16]); |
- RECON_AND_STORE(dest, in[17]); |
- RECON_AND_STORE(dest, in[18]); |
- RECON_AND_STORE(dest, in[19]); |
- RECON_AND_STORE(dest, in[20]); |
- RECON_AND_STORE(dest, in[21]); |
- RECON_AND_STORE(dest, in[22]); |
- RECON_AND_STORE(dest, in[23]); |
- RECON_AND_STORE(dest, in[24]); |
- RECON_AND_STORE(dest, in[25]); |
- RECON_AND_STORE(dest, in[26]); |
- RECON_AND_STORE(dest, in[27]); |
- RECON_AND_STORE(dest, in[28]); |
- RECON_AND_STORE(dest, in[29]); |
- RECON_AND_STORE(dest, in[30]); |
- RECON_AND_STORE(dest, in[31]); |
- |
- dest += 8 - (stride * 32); |
+ in[j] = _mm_adds_epi16(in[j], final_rounding); |
+ in[j] = _mm_srai_epi16(in[j], 6); |
+ RECON_AND_STORE(dest + j * stride, in[j]); |
} |
+ |
+ dest += 8; |
} |
+} |
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
int stride) { |
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
- const __m128i final_rounding = _mm_set1_epi16(1<<5); |
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
const __m128i zero = _mm_setzero_si128(); |
// idct constants for each stage |
@@ -3640,304 +3458,211 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
for (i = 0; i < 4; i++) { |
i32 = (i << 5); |
- // First 1-D idct |
- // Load input data. |
- LOAD_DQCOEFF(in[0], input); |
- LOAD_DQCOEFF(in[8], input); |
- LOAD_DQCOEFF(in[16], input); |
- LOAD_DQCOEFF(in[24], input); |
- LOAD_DQCOEFF(in[1], input); |
- LOAD_DQCOEFF(in[9], input); |
- LOAD_DQCOEFF(in[17], input); |
- LOAD_DQCOEFF(in[25], input); |
- LOAD_DQCOEFF(in[2], input); |
- LOAD_DQCOEFF(in[10], input); |
- LOAD_DQCOEFF(in[18], input); |
- LOAD_DQCOEFF(in[26], input); |
- LOAD_DQCOEFF(in[3], input); |
- LOAD_DQCOEFF(in[11], input); |
- LOAD_DQCOEFF(in[19], input); |
- LOAD_DQCOEFF(in[27], input); |
- |
- LOAD_DQCOEFF(in[4], input); |
- LOAD_DQCOEFF(in[12], input); |
- LOAD_DQCOEFF(in[20], input); |
- LOAD_DQCOEFF(in[28], input); |
- LOAD_DQCOEFF(in[5], input); |
- LOAD_DQCOEFF(in[13], input); |
- LOAD_DQCOEFF(in[21], input); |
- LOAD_DQCOEFF(in[29], input); |
- LOAD_DQCOEFF(in[6], input); |
- LOAD_DQCOEFF(in[14], input); |
- LOAD_DQCOEFF(in[22], input); |
- LOAD_DQCOEFF(in[30], input); |
- LOAD_DQCOEFF(in[7], input); |
- LOAD_DQCOEFF(in[15], input); |
- LOAD_DQCOEFF(in[23], input); |
- LOAD_DQCOEFF(in[31], input); |
- |
- // checking if all entries are zero |
- zero_idx[0] = _mm_or_si128(in[0], in[1]); |
- zero_idx[1] = _mm_or_si128(in[2], in[3]); |
- zero_idx[2] = _mm_or_si128(in[4], in[5]); |
- zero_idx[3] = _mm_or_si128(in[6], in[7]); |
- zero_idx[4] = _mm_or_si128(in[8], in[9]); |
- zero_idx[5] = _mm_or_si128(in[10], in[11]); |
- zero_idx[6] = _mm_or_si128(in[12], in[13]); |
- zero_idx[7] = _mm_or_si128(in[14], in[15]); |
- zero_idx[8] = _mm_or_si128(in[16], in[17]); |
- zero_idx[9] = _mm_or_si128(in[18], in[19]); |
- zero_idx[10] = _mm_or_si128(in[20], in[21]); |
- zero_idx[11] = _mm_or_si128(in[22], in[23]); |
- zero_idx[12] = _mm_or_si128(in[24], in[25]); |
- zero_idx[13] = _mm_or_si128(in[26], in[27]); |
- zero_idx[14] = _mm_or_si128(in[28], in[29]); |
- zero_idx[15] = _mm_or_si128(in[30], in[31]); |
- |
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
- |
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
- |
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { |
- col[i32 + 0] = _mm_setzero_si128(); |
- col[i32 + 1] = _mm_setzero_si128(); |
- col[i32 + 2] = _mm_setzero_si128(); |
- col[i32 + 3] = _mm_setzero_si128(); |
- col[i32 + 4] = _mm_setzero_si128(); |
- col[i32 + 5] = _mm_setzero_si128(); |
- col[i32 + 6] = _mm_setzero_si128(); |
- col[i32 + 7] = _mm_setzero_si128(); |
- col[i32 + 8] = _mm_setzero_si128(); |
- col[i32 + 9] = _mm_setzero_si128(); |
- col[i32 + 10] = _mm_setzero_si128(); |
- col[i32 + 11] = _mm_setzero_si128(); |
- col[i32 + 12] = _mm_setzero_si128(); |
- col[i32 + 13] = _mm_setzero_si128(); |
- col[i32 + 14] = _mm_setzero_si128(); |
- col[i32 + 15] = _mm_setzero_si128(); |
- col[i32 + 16] = _mm_setzero_si128(); |
- col[i32 + 17] = _mm_setzero_si128(); |
- col[i32 + 18] = _mm_setzero_si128(); |
- col[i32 + 19] = _mm_setzero_si128(); |
- col[i32 + 20] = _mm_setzero_si128(); |
- col[i32 + 21] = _mm_setzero_si128(); |
- col[i32 + 22] = _mm_setzero_si128(); |
- col[i32 + 23] = _mm_setzero_si128(); |
- col[i32 + 24] = _mm_setzero_si128(); |
- col[i32 + 25] = _mm_setzero_si128(); |
- col[i32 + 26] = _mm_setzero_si128(); |
- col[i32 + 27] = _mm_setzero_si128(); |
- col[i32 + 28] = _mm_setzero_si128(); |
- col[i32 + 29] = _mm_setzero_si128(); |
- col[i32 + 30] = _mm_setzero_si128(); |
- col[i32 + 31] = _mm_setzero_si128(); |
- continue; |
- } |
- |
- // Transpose 32x8 block to 8x32 block |
- array_transpose_8x8(in, in); |
- array_transpose_8x8(in+8, in+8); |
- array_transpose_8x8(in+16, in+16); |
- array_transpose_8x8(in+24, in+24); |
- |
- IDCT32 |
- |
- // 1_D: Store 32 intermediate results for each 8x32 block. |
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
+ // First 1-D idct |
+ // Load input data. |
+ LOAD_DQCOEFF(in[0], input); |
+ LOAD_DQCOEFF(in[8], input); |
+ LOAD_DQCOEFF(in[16], input); |
+ LOAD_DQCOEFF(in[24], input); |
+ LOAD_DQCOEFF(in[1], input); |
+ LOAD_DQCOEFF(in[9], input); |
+ LOAD_DQCOEFF(in[17], input); |
+ LOAD_DQCOEFF(in[25], input); |
+ LOAD_DQCOEFF(in[2], input); |
+ LOAD_DQCOEFF(in[10], input); |
+ LOAD_DQCOEFF(in[18], input); |
+ LOAD_DQCOEFF(in[26], input); |
+ LOAD_DQCOEFF(in[3], input); |
+ LOAD_DQCOEFF(in[11], input); |
+ LOAD_DQCOEFF(in[19], input); |
+ LOAD_DQCOEFF(in[27], input); |
+ |
+ LOAD_DQCOEFF(in[4], input); |
+ LOAD_DQCOEFF(in[12], input); |
+ LOAD_DQCOEFF(in[20], input); |
+ LOAD_DQCOEFF(in[28], input); |
+ LOAD_DQCOEFF(in[5], input); |
+ LOAD_DQCOEFF(in[13], input); |
+ LOAD_DQCOEFF(in[21], input); |
+ LOAD_DQCOEFF(in[29], input); |
+ LOAD_DQCOEFF(in[6], input); |
+ LOAD_DQCOEFF(in[14], input); |
+ LOAD_DQCOEFF(in[22], input); |
+ LOAD_DQCOEFF(in[30], input); |
+ LOAD_DQCOEFF(in[7], input); |
+ LOAD_DQCOEFF(in[15], input); |
+ LOAD_DQCOEFF(in[23], input); |
+ LOAD_DQCOEFF(in[31], input); |
+ |
+ // checking if all entries are zero |
+ zero_idx[0] = _mm_or_si128(in[0], in[1]); |
+ zero_idx[1] = _mm_or_si128(in[2], in[3]); |
+ zero_idx[2] = _mm_or_si128(in[4], in[5]); |
+ zero_idx[3] = _mm_or_si128(in[6], in[7]); |
+ zero_idx[4] = _mm_or_si128(in[8], in[9]); |
+ zero_idx[5] = _mm_or_si128(in[10], in[11]); |
+ zero_idx[6] = _mm_or_si128(in[12], in[13]); |
+ zero_idx[7] = _mm_or_si128(in[14], in[15]); |
+ zero_idx[8] = _mm_or_si128(in[16], in[17]); |
+ zero_idx[9] = _mm_or_si128(in[18], in[19]); |
+ zero_idx[10] = _mm_or_si128(in[20], in[21]); |
+ zero_idx[11] = _mm_or_si128(in[22], in[23]); |
+ zero_idx[12] = _mm_or_si128(in[24], in[25]); |
+ zero_idx[13] = _mm_or_si128(in[26], in[27]); |
+ zero_idx[14] = _mm_or_si128(in[28], in[29]); |
+ zero_idx[15] = _mm_or_si128(in[30], in[31]); |
+ |
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
+ |
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
+ |
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { |
+ col[i32 + 0] = _mm_setzero_si128(); |
+ col[i32 + 1] = _mm_setzero_si128(); |
+ col[i32 + 2] = _mm_setzero_si128(); |
+ col[i32 + 3] = _mm_setzero_si128(); |
+ col[i32 + 4] = _mm_setzero_si128(); |
+ col[i32 + 5] = _mm_setzero_si128(); |
+ col[i32 + 6] = _mm_setzero_si128(); |
+ col[i32 + 7] = _mm_setzero_si128(); |
+ col[i32 + 8] = _mm_setzero_si128(); |
+ col[i32 + 9] = _mm_setzero_si128(); |
+ col[i32 + 10] = _mm_setzero_si128(); |
+ col[i32 + 11] = _mm_setzero_si128(); |
+ col[i32 + 12] = _mm_setzero_si128(); |
+ col[i32 + 13] = _mm_setzero_si128(); |
+ col[i32 + 14] = _mm_setzero_si128(); |
+ col[i32 + 15] = _mm_setzero_si128(); |
+ col[i32 + 16] = _mm_setzero_si128(); |
+ col[i32 + 17] = _mm_setzero_si128(); |
+ col[i32 + 18] = _mm_setzero_si128(); |
+ col[i32 + 19] = _mm_setzero_si128(); |
+ col[i32 + 20] = _mm_setzero_si128(); |
+ col[i32 + 21] = _mm_setzero_si128(); |
+ col[i32 + 22] = _mm_setzero_si128(); |
+ col[i32 + 23] = _mm_setzero_si128(); |
+ col[i32 + 24] = _mm_setzero_si128(); |
+ col[i32 + 25] = _mm_setzero_si128(); |
+ col[i32 + 26] = _mm_setzero_si128(); |
+ col[i32 + 27] = _mm_setzero_si128(); |
+ col[i32 + 28] = _mm_setzero_si128(); |
+ col[i32 + 29] = _mm_setzero_si128(); |
+ col[i32 + 30] = _mm_setzero_si128(); |
+ col[i32 + 31] = _mm_setzero_si128(); |
+ continue; |
} |
- for (i = 0; i < 4; i++) { |
- // Second 1-D idct |
- j = i << 3; |
- |
- // Transpose 32x8 block to 8x32 block |
- array_transpose_8x8(col+j, in); |
- array_transpose_8x8(col+j+32, in+8); |
- array_transpose_8x8(col+j+64, in+16); |
- array_transpose_8x8(col+j+96, in+24); |
- |
- IDCT32 |
- |
- // 2_D: Calculate the results and store them to destination. |
- in[0] = _mm_add_epi16(stp1_0, stp1_31); |
- in[1] = _mm_add_epi16(stp1_1, stp1_30); |
- in[2] = _mm_add_epi16(stp1_2, stp1_29); |
- in[3] = _mm_add_epi16(stp1_3, stp1_28); |
- in[4] = _mm_add_epi16(stp1_4, stp1_27); |
- in[5] = _mm_add_epi16(stp1_5, stp1_26); |
- in[6] = _mm_add_epi16(stp1_6, stp1_25); |
- in[7] = _mm_add_epi16(stp1_7, stp1_24); |
- in[8] = _mm_add_epi16(stp1_8, stp1_23); |
- in[9] = _mm_add_epi16(stp1_9, stp1_22); |
- in[10] = _mm_add_epi16(stp1_10, stp1_21); |
- in[11] = _mm_add_epi16(stp1_11, stp1_20); |
- in[12] = _mm_add_epi16(stp1_12, stp1_19); |
- in[13] = _mm_add_epi16(stp1_13, stp1_18); |
- in[14] = _mm_add_epi16(stp1_14, stp1_17); |
- in[15] = _mm_add_epi16(stp1_15, stp1_16); |
- in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
- in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
- in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
- in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
- in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
- in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
- in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
- in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
- in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
- in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
- in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
- in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
- in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
- in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
- in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
- in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
+ // Transpose 32x8 block to 8x32 block |
+ array_transpose_8x8(in, in); |
+ array_transpose_8x8(in + 8, in + 8); |
+ array_transpose_8x8(in + 16, in + 16); |
+ array_transpose_8x8(in + 24, in + 24); |
+ |
+ IDCT32 |
+ |
+ // 1_D: Store 32 intermediate results for each 8x32 block. |
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
+ } |
+ for (i = 0; i < 4; i++) { |
+ // Second 1-D idct |
+ j = i << 3; |
+ |
+ // Transpose 32x8 block to 8x32 block |
+ array_transpose_8x8(col + j, in); |
+ array_transpose_8x8(col + j + 32, in + 8); |
+ array_transpose_8x8(col + j + 64, in + 16); |
+ array_transpose_8x8(col + j + 96, in + 24); |
+ |
+ IDCT32 |
+ |
+ // 2_D: Calculate the results and store them to destination. |
+ in[0] = _mm_add_epi16(stp1_0, stp1_31); |
+ in[1] = _mm_add_epi16(stp1_1, stp1_30); |
+ in[2] = _mm_add_epi16(stp1_2, stp1_29); |
+ in[3] = _mm_add_epi16(stp1_3, stp1_28); |
+ in[4] = _mm_add_epi16(stp1_4, stp1_27); |
+ in[5] = _mm_add_epi16(stp1_5, stp1_26); |
+ in[6] = _mm_add_epi16(stp1_6, stp1_25); |
+ in[7] = _mm_add_epi16(stp1_7, stp1_24); |
+ in[8] = _mm_add_epi16(stp1_8, stp1_23); |
+ in[9] = _mm_add_epi16(stp1_9, stp1_22); |
+ in[10] = _mm_add_epi16(stp1_10, stp1_21); |
+ in[11] = _mm_add_epi16(stp1_11, stp1_20); |
+ in[12] = _mm_add_epi16(stp1_12, stp1_19); |
+ in[13] = _mm_add_epi16(stp1_13, stp1_18); |
+ in[14] = _mm_add_epi16(stp1_14, stp1_17); |
+ in[15] = _mm_add_epi16(stp1_15, stp1_16); |
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16); |
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17); |
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18); |
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19); |
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20); |
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21); |
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22); |
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23); |
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24); |
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25); |
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26); |
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27); |
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28); |
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29); |
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30); |
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31); |
+ |
+ for (j = 0; j < 32; ++j) { |
// Final rounding and shift |
- in[0] = _mm_adds_epi16(in[0], final_rounding); |
- in[1] = _mm_adds_epi16(in[1], final_rounding); |
- in[2] = _mm_adds_epi16(in[2], final_rounding); |
- in[3] = _mm_adds_epi16(in[3], final_rounding); |
- in[4] = _mm_adds_epi16(in[4], final_rounding); |
- in[5] = _mm_adds_epi16(in[5], final_rounding); |
- in[6] = _mm_adds_epi16(in[6], final_rounding); |
- in[7] = _mm_adds_epi16(in[7], final_rounding); |
- in[8] = _mm_adds_epi16(in[8], final_rounding); |
- in[9] = _mm_adds_epi16(in[9], final_rounding); |
- in[10] = _mm_adds_epi16(in[10], final_rounding); |
- in[11] = _mm_adds_epi16(in[11], final_rounding); |
- in[12] = _mm_adds_epi16(in[12], final_rounding); |
- in[13] = _mm_adds_epi16(in[13], final_rounding); |
- in[14] = _mm_adds_epi16(in[14], final_rounding); |
- in[15] = _mm_adds_epi16(in[15], final_rounding); |
- in[16] = _mm_adds_epi16(in[16], final_rounding); |
- in[17] = _mm_adds_epi16(in[17], final_rounding); |
- in[18] = _mm_adds_epi16(in[18], final_rounding); |
- in[19] = _mm_adds_epi16(in[19], final_rounding); |
- in[20] = _mm_adds_epi16(in[20], final_rounding); |
- in[21] = _mm_adds_epi16(in[21], final_rounding); |
- in[22] = _mm_adds_epi16(in[22], final_rounding); |
- in[23] = _mm_adds_epi16(in[23], final_rounding); |
- in[24] = _mm_adds_epi16(in[24], final_rounding); |
- in[25] = _mm_adds_epi16(in[25], final_rounding); |
- in[26] = _mm_adds_epi16(in[26], final_rounding); |
- in[27] = _mm_adds_epi16(in[27], final_rounding); |
- in[28] = _mm_adds_epi16(in[28], final_rounding); |
- in[29] = _mm_adds_epi16(in[29], final_rounding); |
- in[30] = _mm_adds_epi16(in[30], final_rounding); |
- in[31] = _mm_adds_epi16(in[31], final_rounding); |
- |
- in[0] = _mm_srai_epi16(in[0], 6); |
- in[1] = _mm_srai_epi16(in[1], 6); |
- in[2] = _mm_srai_epi16(in[2], 6); |
- in[3] = _mm_srai_epi16(in[3], 6); |
- in[4] = _mm_srai_epi16(in[4], 6); |
- in[5] = _mm_srai_epi16(in[5], 6); |
- in[6] = _mm_srai_epi16(in[6], 6); |
- in[7] = _mm_srai_epi16(in[7], 6); |
- in[8] = _mm_srai_epi16(in[8], 6); |
- in[9] = _mm_srai_epi16(in[9], 6); |
- in[10] = _mm_srai_epi16(in[10], 6); |
- in[11] = _mm_srai_epi16(in[11], 6); |
- in[12] = _mm_srai_epi16(in[12], 6); |
- in[13] = _mm_srai_epi16(in[13], 6); |
- in[14] = _mm_srai_epi16(in[14], 6); |
- in[15] = _mm_srai_epi16(in[15], 6); |
- in[16] = _mm_srai_epi16(in[16], 6); |
- in[17] = _mm_srai_epi16(in[17], 6); |
- in[18] = _mm_srai_epi16(in[18], 6); |
- in[19] = _mm_srai_epi16(in[19], 6); |
- in[20] = _mm_srai_epi16(in[20], 6); |
- in[21] = _mm_srai_epi16(in[21], 6); |
- in[22] = _mm_srai_epi16(in[22], 6); |
- in[23] = _mm_srai_epi16(in[23], 6); |
- in[24] = _mm_srai_epi16(in[24], 6); |
- in[25] = _mm_srai_epi16(in[25], 6); |
- in[26] = _mm_srai_epi16(in[26], 6); |
- in[27] = _mm_srai_epi16(in[27], 6); |
- in[28] = _mm_srai_epi16(in[28], 6); |
- in[29] = _mm_srai_epi16(in[29], 6); |
- in[30] = _mm_srai_epi16(in[30], 6); |
- in[31] = _mm_srai_epi16(in[31], 6); |
- |
- RECON_AND_STORE(dest, in[0]); |
- RECON_AND_STORE(dest, in[1]); |
- RECON_AND_STORE(dest, in[2]); |
- RECON_AND_STORE(dest, in[3]); |
- RECON_AND_STORE(dest, in[4]); |
- RECON_AND_STORE(dest, in[5]); |
- RECON_AND_STORE(dest, in[6]); |
- RECON_AND_STORE(dest, in[7]); |
- RECON_AND_STORE(dest, in[8]); |
- RECON_AND_STORE(dest, in[9]); |
- RECON_AND_STORE(dest, in[10]); |
- RECON_AND_STORE(dest, in[11]); |
- RECON_AND_STORE(dest, in[12]); |
- RECON_AND_STORE(dest, in[13]); |
- RECON_AND_STORE(dest, in[14]); |
- RECON_AND_STORE(dest, in[15]); |
- RECON_AND_STORE(dest, in[16]); |
- RECON_AND_STORE(dest, in[17]); |
- RECON_AND_STORE(dest, in[18]); |
- RECON_AND_STORE(dest, in[19]); |
- RECON_AND_STORE(dest, in[20]); |
- RECON_AND_STORE(dest, in[21]); |
- RECON_AND_STORE(dest, in[22]); |
- RECON_AND_STORE(dest, in[23]); |
- RECON_AND_STORE(dest, in[24]); |
- RECON_AND_STORE(dest, in[25]); |
- RECON_AND_STORE(dest, in[26]); |
- RECON_AND_STORE(dest, in[27]); |
- RECON_AND_STORE(dest, in[28]); |
- RECON_AND_STORE(dest, in[29]); |
- RECON_AND_STORE(dest, in[30]); |
- RECON_AND_STORE(dest, in[31]); |
- |
- dest += 8 - (stride * 32); |
+ in[j] = _mm_adds_epi16(in[j], final_rounding); |
+ in[j] = _mm_srai_epi16(in[j], 6); |
+ RECON_AND_STORE(dest + j * stride, in[j]); |
} |
-} //NOLINT |
+ |
+ dest += 8; |
+ } |
+} |
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
__m128i dc_value; |
@@ -3951,66 +3676,38 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
dc_value = _mm_set1_epi16(a); |
for (i = 0; i < 4; ++i) { |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- RECON_AND_STORE(dest, dc_value); |
- dest += 8 - (stride * 32); |
+ int j; |
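+ // Add the DC value to all 32 rows of this 8-pixel-wide group of columns.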
+ for (j = 0; j < 32; ++j) { |
+ RECON_AND_STORE(dest + j * stride, dc_value); |
+ } |
+ dest += 8; |
} |
} |
#if CONFIG_VP9_HIGHBITDEPTH |
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { |
- __m128i ubounded, retval; |
- const __m128i zero = _mm_set1_epi16(0); |
- const __m128i one = _mm_set1_epi16(1); |
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); |
- ubounded = _mm_cmpgt_epi16(value, max); |
- retval = _mm_andnot_si128(ubounded, value); |
- ubounded = _mm_and_si128(ubounded, max); |
- retval = _mm_or_si128(retval, ubounded); |
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); |
- return retval; |
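+ // Clamp each 16-bit lane of |value| to the range [0, (1 << bd) - 1].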
+ __m128i ubounded, retval; |
+ const __m128i zero = _mm_set1_epi16(0); |
+ const __m128i one = _mm_set1_epi16(1); |
+ const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); |
+ ubounded = _mm_cmpgt_epi16(value, max); |
+ retval = _mm_andnot_si128(ubounded, value); |
+ ubounded = _mm_and_si128(ubounded, max); |
+ retval = _mm_or_si128(retval, ubounded); |
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); |
+ return retval; |
} |
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
- int stride, int bd) { |
+ int stride, int bd) { |
tran_low_t out[4 * 4]; |
tran_low_t *outptr = out; |
int i, j; |
__m128i inptr[4]; |
__m128i sign_bits[2]; |
- __m128i temp_mm, min_input, max_input; |
+ __m128i temp_mm, min_input, max_input; |
int test; |
- uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); |
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
int optimised_cols = 0; |
const __m128i zero = _mm_set1_epi16(0); |
const __m128i eight = _mm_set1_epi16(8); |
@@ -4053,10 +3750,10 @@ void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); |
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); |
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); |
- _mm_storeu_si128((__m128i*)outptr, inptr[0]); |
- _mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]); |
- _mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]); |
- _mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]); |
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]); |
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); |
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); |
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); |
} else { |
// Set to use the optimised transform for the column |
optimised_cols = 1; |
@@ -4084,10 +3781,10 @@ void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
{ |
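+ // Load the four destination rows, add the inverse-transformed residual and clamp to the range allowed by bd.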
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest); |
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); |
- d0 = _mm_unpacklo_epi64(d0, |
- _mm_loadl_epi64((const __m128i *)(dest + stride))); |
- d2 = _mm_unpacklo_epi64(d2, |
- _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); |
+ d0 = _mm_unpacklo_epi64( |
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); |
+ d2 = _mm_unpacklo_epi64( |
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); |
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); |
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); |
// store input0 |
@@ -4118,13 +3815,13 @@ void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, |
} |
void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
- int stride, int bd) { |
+ int stride, int bd) { |
tran_low_t out[8 * 8]; |
tran_low_t *outptr = out; |
int i, j, test; |
__m128i inptr[8]; |
__m128i min_input, max_input, temp1, temp2, sign_bits; |
- uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); |
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
const __m128i zero = _mm_set1_epi16(0); |
const __m128i sixteen = _mm_set1_epi16(16); |
const __m128i max = _mm_set1_epi16(6201); |
@@ -4133,8 +3830,8 @@ void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
// Load input into __m128i & pack to 16 bits |
for (i = 0; i < 8; i++) { |
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i)); |
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4)); |
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); |
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); |
inptr[i] = _mm_packs_epi32(temp1, temp2); |
} |
@@ -4172,8 +3869,8 @@ void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
} |
} else { |
// Set to use the optimised transform for the column |
@@ -4219,13 +3916,13 @@ void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, |
} |
void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
- int stride, int bd) { |
+ int stride, int bd) { |
tran_low_t out[8 * 8] = { 0 }; |
tran_low_t *outptr = out; |
int i, j, test; |
__m128i inptr[8]; |
__m128i min_input, max_input, temp1, temp2, sign_bits; |
- uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); |
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
const __m128i zero = _mm_set1_epi16(0); |
const __m128i sixteen = _mm_set1_epi16(16); |
const __m128i max = _mm_set1_epi16(6201); |
@@ -4234,8 +3931,8 @@ void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
// Load input into __m128i & pack to 16 bits |
for (i = 0; i < 8; i++) { |
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i)); |
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4)); |
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); |
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); |
inptr[i] = _mm_packs_epi32(temp1, temp2); |
} |
@@ -4276,8 +3973,8 @@ void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); |
} |
} else { |
// Set to use the optimised transform for the column |
@@ -4323,13 +4020,13 @@ void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
} |
void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
- int stride, int bd) { |
+ int stride, int bd) { |
tran_low_t out[16 * 16]; |
tran_low_t *outptr = out; |
int i, j, test; |
__m128i inptr[32]; |
__m128i min_input, max_input, temp1, temp2, sign_bits; |
- uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); |
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
const __m128i zero = _mm_set1_epi16(0); |
const __m128i rounding = _mm_set1_epi16(32); |
const __m128i max = _mm_set1_epi16(3155); |
@@ -4338,11 +4035,11 @@ void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
// Load input into __m128i & pack to 16 bits |
for (i = 0; i < 16; i++) { |
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i)); |
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4)); |
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); |
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); |
inptr[i] = _mm_packs_epi32(temp1, temp2); |
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8)); |
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12)); |
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); |
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); |
inptr[i + 16] = _mm_packs_epi32(temp1, temp2); |
} |
@@ -4378,15 +4075,15 @@ void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
array_transpose_16x16(inptr, inptr + 16); |
for (i = 0; i < 16; i++) { |
sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
- temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits); |
- temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2); |
- sign_bits = _mm_cmplt_epi16(inptr[i+16], zero); |
- temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits); |
- temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2); |
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); |
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); |
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); |
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
} |
} else { |
// Set to use the optimised transform for the column |
@@ -4437,13 +4134,13 @@ void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, |
} |
void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
- int stride, int bd) { |
+ int stride, int bd) { |
tran_low_t out[16 * 16] = { 0 }; |
tran_low_t *outptr = out; |
int i, j, test; |
__m128i inptr[32]; |
__m128i min_input, max_input, temp1, temp2, sign_bits; |
- uint16_t * dest = CONVERT_TO_SHORTPTR(dest8); |
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
const __m128i zero = _mm_set1_epi16(0); |
const __m128i rounding = _mm_set1_epi16(32); |
const __m128i max = _mm_set1_epi16(3155); |
@@ -4452,11 +4149,11 @@ void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
// Load input into __m128i & pack to 16 bits |
for (i = 0; i < 16; i++) { |
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i)); |
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4)); |
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); |
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); |
inptr[i] = _mm_packs_epi32(temp1, temp2); |
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8)); |
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12)); |
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); |
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); |
inptr[i + 16] = _mm_packs_epi32(temp1, temp2); |
} |
@@ -4497,15 +4194,15 @@ void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, |
array_transpose_8x8(inptr + 8, inptr + 16); |
for (i = 0; i < 4; i++) { |
sign_bits = _mm_cmplt_epi16(inptr[i], zero); |
- temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits); |
- temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2); |
- sign_bits = _mm_cmplt_epi16(inptr[i+16], zero); |
- temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits); |
- temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1); |
- _mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2); |
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); |
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); |
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); |
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); |
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); |
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); |
} |
} else { |
// Set to use the optimised transform for the column |