OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include <assert.h> |
| 12 #include <emmintrin.h> // SSE2 |
| 13 #include "./vpx_config.h" |
| 14 #include "vpx/vpx_integer.h" |
| 15 #include "vp9/common/vp9_common.h" |
| 16 #include "vp9/common/vp9_idct.h" |
| 17 |
| 18 // perform 8x8 transpose |
| 19 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { |
| 20 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |
| 21 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); |
| 22 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); |
| 23 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); |
| 24 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); |
| 25 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); |
| 26 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); |
| 27 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); |
| 28 |
| 29 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 30 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
| 31 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 32 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
| 33 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
| 34 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); |
| 35 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
| 36 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); |
| 37 |
| 38 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); |
| 39 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); |
| 40 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); |
| 41 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); |
| 42 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); |
| 43 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); |
| 44 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); |
| 45 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); |
| 46 } |
| 47 |
/* Interleaves the low four 16-bit lanes of four row vectors.
 * After expansion:
 *   in0 = { in0[0], in1[0], in2[0], in3[0], in0[1], in1[1], in2[1], in3[1] }
 *   in1 = { in0[2], in1[2], in2[2], in3[2], in0[3], in1[3], in2[3], in3[3] }
 * NOTE: the results are written back to in0 and in1; the out0 and out1
 * arguments are accepted but never referenced by the expansion.
 */
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)                    \
  {                                                                      \
    const __m128i tr8x4_rows01 = _mm_unpacklo_epi16((in0), (in1));       \
    const __m128i tr8x4_rows23 = _mm_unpacklo_epi16((in2), (in3));       \
                                                                         \
    /* columns 0 and 1 */                                                \
    (in0) = _mm_unpacklo_epi32(tr8x4_rows01, tr8x4_rows23);              \
    /* columns 2 and 3 */                                                \
    (in1) = _mm_unpackhi_epi32(tr8x4_rows01, tr8x4_rows23);              \
  }
| 56 |
| 57 static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { |
| 58 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |
| 59 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); |
| 60 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); |
| 61 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); |
| 62 |
| 63 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 64 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 65 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
| 66 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
| 67 |
| 68 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); |
| 69 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); |
| 70 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); |
| 71 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); |
| 72 } |
| 73 |
| 74 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { |
| 75 __m128i tbuf[8]; |
| 76 array_transpose_8x8(res0, res0); |
| 77 array_transpose_8x8(res1, tbuf); |
| 78 array_transpose_8x8(res0 + 8, res1); |
| 79 array_transpose_8x8(res1 + 8, res1 + 8); |
| 80 |
| 81 res0[8] = tbuf[0]; |
| 82 res0[9] = tbuf[1]; |
| 83 res0[10] = tbuf[2]; |
| 84 res0[11] = tbuf[3]; |
| 85 res0[12] = tbuf[4]; |
| 86 res0[13] = tbuf[5]; |
| 87 res0[14] = tbuf[6]; |
| 88 res0[15] = tbuf[7]; |
| 89 } |
| 90 |
| 91 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { |
| 92 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); |
| 93 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); |
| 94 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); |
| 95 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); |
| 96 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); |
| 97 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); |
| 98 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); |
| 99 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); |
| 100 |
| 101 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); |
| 102 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); |
| 103 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); |
| 104 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); |
| 105 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); |
| 106 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); |
| 107 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); |
| 108 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); |
| 109 } |
| 110 |
/* Adds eight 16-bit values (in_x) to the eight bytes at 'dest', saturates the
 * sums to the range 0..255, writes them back, and then advances 'dest' by
 * 'stride'.
 * The expansion relies on two names from the enclosing scope: 'zero' (an
 * all-zero __m128i) and 'stride'. 'dest' must be a modifiable pointer.
 */
#define RECON_AND_STORE(dest, in_x)                                      \
  {                                                                      \
    const __m128i recon_px8 = _mm_loadl_epi64((__m128i *)(dest));        \
    /* widen the bytes to 16 bits so the addition cannot wrap */         \
    const __m128i recon_px16 = _mm_unpacklo_epi8(recon_px8, zero);       \
    const __m128i recon_sum = _mm_add_epi16((in_x), recon_px16);         \
    /* unsigned saturation back to bytes clamps to 0..255 */             \
    _mm_storel_epi64((__m128i *)(dest),                                  \
                     _mm_packus_epi16(recon_sum, recon_sum));            \
    dest += stride;                                                      \
  }
| 120 |
| 121 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { |
| 122 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
| 123 const __m128i zero = _mm_setzero_si128(); |
| 124 // Final rounding and shift |
| 125 in[0] = _mm_adds_epi16(in[0], final_rounding); |
| 126 in[1] = _mm_adds_epi16(in[1], final_rounding); |
| 127 in[2] = _mm_adds_epi16(in[2], final_rounding); |
| 128 in[3] = _mm_adds_epi16(in[3], final_rounding); |
| 129 in[4] = _mm_adds_epi16(in[4], final_rounding); |
| 130 in[5] = _mm_adds_epi16(in[5], final_rounding); |
| 131 in[6] = _mm_adds_epi16(in[6], final_rounding); |
| 132 in[7] = _mm_adds_epi16(in[7], final_rounding); |
| 133 in[8] = _mm_adds_epi16(in[8], final_rounding); |
| 134 in[9] = _mm_adds_epi16(in[9], final_rounding); |
| 135 in[10] = _mm_adds_epi16(in[10], final_rounding); |
| 136 in[11] = _mm_adds_epi16(in[11], final_rounding); |
| 137 in[12] = _mm_adds_epi16(in[12], final_rounding); |
| 138 in[13] = _mm_adds_epi16(in[13], final_rounding); |
| 139 in[14] = _mm_adds_epi16(in[14], final_rounding); |
| 140 in[15] = _mm_adds_epi16(in[15], final_rounding); |
| 141 |
| 142 in[0] = _mm_srai_epi16(in[0], 6); |
| 143 in[1] = _mm_srai_epi16(in[1], 6); |
| 144 in[2] = _mm_srai_epi16(in[2], 6); |
| 145 in[3] = _mm_srai_epi16(in[3], 6); |
| 146 in[4] = _mm_srai_epi16(in[4], 6); |
| 147 in[5] = _mm_srai_epi16(in[5], 6); |
| 148 in[6] = _mm_srai_epi16(in[6], 6); |
| 149 in[7] = _mm_srai_epi16(in[7], 6); |
| 150 in[8] = _mm_srai_epi16(in[8], 6); |
| 151 in[9] = _mm_srai_epi16(in[9], 6); |
| 152 in[10] = _mm_srai_epi16(in[10], 6); |
| 153 in[11] = _mm_srai_epi16(in[11], 6); |
| 154 in[12] = _mm_srai_epi16(in[12], 6); |
| 155 in[13] = _mm_srai_epi16(in[13], 6); |
| 156 in[14] = _mm_srai_epi16(in[14], 6); |
| 157 in[15] = _mm_srai_epi16(in[15], 6); |
| 158 |
| 159 RECON_AND_STORE(dest, in[0]); |
| 160 RECON_AND_STORE(dest, in[1]); |
| 161 RECON_AND_STORE(dest, in[2]); |
| 162 RECON_AND_STORE(dest, in[3]); |
| 163 RECON_AND_STORE(dest, in[4]); |
| 164 RECON_AND_STORE(dest, in[5]); |
| 165 RECON_AND_STORE(dest, in[6]); |
| 166 RECON_AND_STORE(dest, in[7]); |
| 167 RECON_AND_STORE(dest, in[8]); |
| 168 RECON_AND_STORE(dest, in[9]); |
| 169 RECON_AND_STORE(dest, in[10]); |
| 170 RECON_AND_STORE(dest, in[11]); |
| 171 RECON_AND_STORE(dest, in[12]); |
| 172 RECON_AND_STORE(dest, in[13]); |
| 173 RECON_AND_STORE(dest, in[14]); |
| 174 RECON_AND_STORE(dest, in[15]); |
| 175 } |
OLD | NEW |