| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license | |
| 5 * that can be found in the LICENSE file in the root of the source | |
| 6 * tree. An additional intellectual property rights grant can be found | |
| 7 * in the file PATENTS. All contributing project authors may | |
| 8 * be found in the AUTHORS file in the root of the source tree. | |
| 9 */ | |
| 10 | |
| 11 #include <emmintrin.h> // SSE2 | |
| 12 #include "vp9/common/vp9_idct.h" // for cospi constants | |
| 13 #include "vp9/encoder/vp9_dct.h" | |
| 14 #include "vp9/encoder/x86/vp9_dct_sse2.h" | |
| 15 #include "vpx_ports/mem.h" | |
| 16 | |
| 17 #if DCT_HIGH_BIT_DEPTH | |
| 18 #define ADD_EPI16 _mm_adds_epi16 | |
| 19 #define SUB_EPI16 _mm_subs_epi16 | |
| 20 | |
| 21 #else | |
| 22 #define ADD_EPI16 _mm_add_epi16 | |
| 23 #define SUB_EPI16 _mm_sub_epi16 | |
| 24 #endif | |
| 25 | |
| 26 void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { | |
| 27   // This 2D transform implements 4 vertical 1D transforms followed | |
| 28   // by 4 horizontal 1D transforms. The multiplies and adds are as given | |
| 29   // by Chen, Smith and Fralick ('77). The commands for moving the data | |
| 30   // around have been minimized by hand. | |
| 31   // For the purposes of the comments, the 16 inputs are referred to as i0 | |
| 32   // through iF (in raster order); the intermediate variables are a0-aF, | |
| 33   // b0-bF, etc. (one letter per stage), and correspond to the in-place | |
| 34   // computations mapped to input locations. The outputs, o0 through oF, | |
| 35   // are labeled according to the output locations. | |
| 36 | |
| 37   // Constants | |
| 38   // These are the coefficients used for the multiplies. | |
| 39   // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), | |
| 40   // where cospi_N_64 = cos(N pi /64) | |
| 41   const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64, | |
| 42                                             cospi_16_64, cospi_16_64, | |
| 43                                             cospi_16_64, -cospi_16_64, | |
| 44                                             cospi_16_64, -cospi_16_64); | |
| 45   const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64, | |
| 46                                             cospi_16_64, -cospi_16_64, | |
| 47                                             cospi_16_64, cospi_16_64, | |
| 48                                             cospi_16_64, cospi_16_64); | |
| 49   const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64, | |
| 50                                             cospi_8_64, cospi_24_64, | |
| 51                                             cospi_24_64, -cospi_8_64, | |
| 52                                             cospi_24_64, -cospi_8_64); | |
| 53   const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64, | |
| 54                                             cospi_24_64, -cospi_8_64, | |
| 55                                             cospi_8_64, cospi_24_64, | |
| 56                                             cospi_8_64, cospi_24_64); | |
| 57   const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64, | |
| 58                                             cospi_16_64, cospi_16_64, | |
| 59                                             cospi_16_64, cospi_16_64, | |
| 60                                             cospi_16_64, cospi_16_64); | |
| 61   const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64, | |
| 62                                             cospi_16_64, -cospi_16_64, | |
| 63                                             cospi_16_64, -cospi_16_64, | |
| 64                                             cospi_16_64, -cospi_16_64); | |
| 65   const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64, | |
| 66                                             cospi_8_64, cospi_24_64, | |
| 67                                             -cospi_8_64, -cospi_24_64, | |
| 68                                             -cospi_8_64, -cospi_24_64); | |
| 69   const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64, | |
| 70                                             cospi_24_64, -cospi_8_64, | |
| 71                                             -cospi_24_64, cospi_8_64, | |
| 72                                             -cospi_24_64, cospi_8_64); | |
| 73 | |
| 74   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | |
| 75   // This second rounding constant saves doing some extra adds at the end | |
| 76   const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING | |
| 77                                                         +(DCT_CONST_ROUNDING << 1)); | |
| 78   const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; | |
| 79   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); | |
| 80   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); | |
| 81   __m128i in0, in1; | |
| 82 #if DCT_HIGH_BIT_DEPTH | |
| 83   __m128i cmp0, cmp1; | |
| 84   int test, overflow; | |
| 85 #endif | |
| 86 | |
| 87   // Load inputs. | |
| 88   in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | |
| 89   in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); | |
| 90   in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) | |
| 91                                                 (input + 2 * stride))); | |
| 92   in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) | |
| 93                                                 (input + 3 * stride))); | |
| 94   // in0 = [i0 i1 i2 i3 iC iD iE iF] | |
| 95   // in1 = [i4 i5 i6 i7 i8 i9 iA iB] | |
| 96 #if DCT_HIGH_BIT_DEPTH | |
| 97   // Check inputs are small enough (fit in 11 signed bits) to use the | |
| 98   // optimised code; otherwise fall back to the C implementation. | |
| 99   cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), | |
| 100                        _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00))); | |
| 101   cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), | |
| 102                        _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00))); | |
| 103   test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); | |
| 104   if (test) { | |
| 105     vp9_highbd_fdct4x4_c(input, output, stride); | |
| 106     return; | |
| 107   } | |
| 108 #endif  // DCT_HIGH_BIT_DEPTH | |
| 109 | |
| 110   // multiply by 16 to give some extra precision | |
| 111   in0 = _mm_slli_epi16(in0, 4); | |
| 112   in1 = _mm_slli_epi16(in1, 4); | |
| 113   // if (i == 0 && input[0]) input[0] += 1; | |
| 114   // add 1 to the upper left pixel if it is non-zero, which helps reduce | |
| 115   // the round-trip error | |
| 116   { | |
| 117     // The mask will only contain whether the first value is zero; all | |
| 118     // other comparisons will fail as something shifted by 4 (above << 4) | |
| 119     // can never be equal to one. To increment in the non-zero case, we | |
| 120     // add the mask and one for the first element: | |
| 121     //   - if zero, mask = -1, v = v - 1 + 1 = v | |
| 122     //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 | |
| 123     __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); | |
| 124     in0 = _mm_add_epi16(in0, mask); | |
| 125     in0 = _mm_add_epi16(in0, k__nonzero_bias_b); | |
| 126   } | |
| 127   // There are 4 total stages, alternating between an add/subtract stage | |
| 128   // followed by a multiply-and-add stage. | |
| 129   { | |
| 130     // Stage 1: Add/subtract | |
| 131 | |
| 132     // in0 = [i0 i1 i2 i3 iC iD iE iF] | |
| 133     // in1 = [i4 i5 i6 i7 i8 i9 iA iB] | |
| 134     const __m128i r0 = _mm_unpacklo_epi16(in0, in1); | |
| 135     const __m128i r1 = _mm_unpackhi_epi16(in0, in1); | |
| 136     // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] | |
| 137     // r1 = [iC i8 iD i9 iE iA iF iB] | |
| 138     const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); | |
| 139     const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); | |
| 140     // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] | |
| 141     // r3 = [iC i8 iD i9 iF iB iE iA] | |
| 142 | |
| 143     const __m128i t0 = _mm_add_epi16(r2, r3); | |
| 144     const __m128i t1 = _mm_sub_epi16(r2, r3); | |
| 145     // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] | |
| 146     // t1 = [aC a8 aD a9 aF aB aE aA] | |
| 147 | |
| 148     // Stage 2: multiply by constants (which gets us into 32 bits). | |
| 149     // The constants needed here are: | |
| 150     //   k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] | |
| 151     //   k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] | |
| 152     //   k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] | |
| 153     //   k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] | |
| 154     const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); | |
| 155     const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); | |
| 156     const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); | |
| 157     const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); | |
| 158     // Then add and right-shift to get back to 16-bit range | |
| 159     const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | |
| 160     const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | |
| 161     const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | |
| 162     const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | |
| 163     const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | |
| 164     const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | |
| 165     const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | |
| 166     const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | |
| 167     // w0 = [b0 b1 b7 b6] | |
| 168     // w1 = [b8 b9 bF bE] | |
| 169     // w2 = [b4 b5 b3 b2] | |
| 170     // w3 = [bC bD bB bA] | |
| 171     const __m128i x0 = _mm_packs_epi32(w0, w1); | |
| 172     const __m128i x1 = _mm_packs_epi32(w2, w3); | |
| 173 #if DCT_HIGH_BIT_DEPTH | |
| 174     overflow = check_epi16_overflow_x2(&x0, &x1); | |
| 175     if (overflow) { | |
| 176       vp9_highbd_fdct4x4_c(input, output, stride); | |
| 177       return; | |
| 178     } | |
| 179 #endif  // DCT_HIGH_BIT_DEPTH | |
| 180     // x0 = [b0 b1 b7 b6 b8 b9 bF bE] | |
| 181     // x1 = [b4 b5 b3 b2 bC bD bB bA] | |
| 182     in0 = _mm_shuffle_epi32(x0, 0xD8); | |
| 183     in1 = _mm_shuffle_epi32(x1, 0x8D); | |
| 184     // in0 = [b0 b1 b8 b9 b7 b6 bF bE] | |
| 185     // in1 = [b3 b2 bB bA b4 b5 bC bD] | |
| 186   } | |
| 187   { | |
| 188     // vertical DCTs finished. Now we do the horizontal DCTs. | |
| 189     // Stage 3: Add/subtract | |
| 190 | |
| 191     const __m128i t0 = ADD_EPI16(in0, in1); | |
| 192     const __m128i t1 = SUB_EPI16(in0, in1); | |
| 193     // t0 = [c0 c1 c8 c9  c4  c5  cC  cD] | |
| 194     // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] | |
| 195 #if DCT_HIGH_BIT_DEPTH | |
| 196     overflow = check_epi16_overflow_x2(&t0, &t1); | |
| 197     if (overflow) { | |
| 198       vp9_highbd_fdct4x4_c(input, output, stride); | |
| 199       return; | |
| 200     } | |
| 201 #endif  // DCT_HIGH_BIT_DEPTH | |
| 202 | |
| 203     // Stage 4: multiply by constants (which gets us into 32 bits). | |
| 204     { | |
| 205       // The constants needed here are: | |
| 206       //   k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] | |
| 207       //   k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] | |
| 208       //   k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] | |
| 209       //   k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] | |
| 210       const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); | |
| 211       const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); | |
| 212       const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); | |
| 213       const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); | |
| 214       // Then add and right-shift to get back to 16-bit range, | |
| 215       // but this combines the final right-shift as well to save operations. | |
| 216       // This unusual rounding operation is to maintain bit-accurate | |
| 217       // compatibility with the c version of this function which has two | |
| 218       // rounding steps in a row. | |
| 219       const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); | |
| 220       const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); | |
| 221       const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); | |
| 222       const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); | |
| 223       const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); | |
| 224       const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); | |
| 225       const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); | |
| 226       const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); | |
| 227       // w0 = [o0 o4 o8 oC] | |
| 228       // w1 = [o2 o6 oA oE] | |
| 229       // w2 = [o1 o5 o9 oD] | |
| 230       // w3 = [o3 o7 oB oF] | |
| 231       // remember the o's are numbered according to the correct output location | |
| 232       const __m128i x0 = _mm_packs_epi32(w0, w1); | |
| 233       const __m128i x1 = _mm_packs_epi32(w2, w3); | |
| 234 #if DCT_HIGH_BIT_DEPTH | |
| 235       overflow = check_epi16_overflow_x2(&x0, &x1); | |
| 236       if (overflow) { | |
| 237         vp9_highbd_fdct4x4_c(input, output, stride); | |
| 238         return; | |
| 239       } | |
| 240 #endif  // DCT_HIGH_BIT_DEPTH | |
| 241       { | |
| 242         // x0 = [o0 o4 o8 oC o2 o6 oA oE] | |
| 243         // x1 = [o1 o5 o9 oD o3 o7 oB oF] | |
| 244         const __m128i y0 = _mm_unpacklo_epi16(x0, x1); | |
| 245         const __m128i y1 = _mm_unpackhi_epi16(x0, x1); | |
| 246         // y0 = [o0 o1 o4 o5 o8 o9 oC oD] | |
| 247         // y1 = [o2 o3 o6 o7 oA oB oE oF] | |
| 248         in0 = _mm_unpacklo_epi32(y0, y1); | |
| 249         // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] | |
| 250         in1 = _mm_unpackhi_epi32(y0, y1); | |
| 251         // in1 = [o8 o9 oA oB oC oD oE oF] | |
| 252       } | |
| 253     } | |
| 254   } | |
| 255   // Post-condition (v + 1) >> 2 is now incorporated into previous | |
| 256   // add and right-shift commands. Only 2 store instructions needed | |
| 257   // because we are using the fact that 1/3 are stored just after 0/2. | |
| 258   storeu_output(&in0, output + 0 * 4); | |
| 259   storeu_output(&in1, output + 2 * 4); | |
| 260 } | |
| 260 | |
| 261 | |
| 262 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { | |
| 263   int pass; | |
| 264   // Constants | |
| 265   // When we use them, in one case, they are all the same. In all others | |
| 266   // it's a pair of them that we need to repeat four times. This is done | |
| 267   // by constructing the 32 bit constant corresponding to that pair. | |
| 268   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | |
| 269   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | |
| 270   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | |
| 271   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | |
| 272   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | |
| 273   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | |
| 274   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | |
| 275   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | |
| 276   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | |
| 277 #if DCT_HIGH_BIT_DEPTH | |
| 278   int overflow; | |
| 279 #endif | |
| 280   // Load input | |
| 281   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); | |
| 282   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); | |
| 283   __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); | |
| 284   __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); | |
| 285   __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); | |
| 286   __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); | |
| 287   __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); | |
| 288   __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); | |
| 289   // Pre-condition input (shift by two) | |
| 290   in0 = _mm_slli_epi16(in0, 2); | |
| 291   in1 = _mm_slli_epi16(in1, 2); | |
| 292   in2 = _mm_slli_epi16(in2, 2); | |
| 293   in3 = _mm_slli_epi16(in3, 2); | |
| 294   in4 = _mm_slli_epi16(in4, 2); | |
| 295   in5 = _mm_slli_epi16(in5, 2); | |
| 296   in6 = _mm_slli_epi16(in6, 2); | |
| 297   in7 = _mm_slli_epi16(in7, 2); | |
| 298 | |
| 299   // We do two passes, first the columns, then the rows. The results of the | |
| 300   // first pass are transposed so that the same column code can be reused. The | |
| 301   // results of the second pass are also transposed so that the rows (processed | |
| 302   // as columns) are put back in row positions. | |
| 303   for (pass = 0; pass < 2; pass++) { | |
| 304     // To store results of each pass before the transpose. | |
| 305     __m128i res0, res1, res2, res3, res4, res5, res6, res7; | |
| 306     // Add/subtract | |
| 307     const __m128i q0 = ADD_EPI16(in0, in7); | |
| 308     const __m128i q1 = ADD_EPI16(in1, in6); | |
| 309     const __m128i q2 = ADD_EPI16(in2, in5); | |
| 310     const __m128i q3 = ADD_EPI16(in3, in4); | |
| 311     const __m128i q4 = SUB_EPI16(in3, in4); | |
| 312     const __m128i q5 = SUB_EPI16(in2, in5); | |
| 313     const __m128i q6 = SUB_EPI16(in1, in6); | |
| 314     const __m128i q7 = SUB_EPI16(in0, in7); | |
| 315 #if DCT_HIGH_BIT_DEPTH | |
| 316     if (pass == 1) { | |
| 317       overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, | |
| 318                                          &q4, &q5, &q6, &q7); | |
| 319       if (overflow) { | |
| 320         vp9_highbd_fdct8x8_c(input, output, stride); | |
| 321         return; | |
| 322       } | |
| 323     } | |
| 324 #endif  // DCT_HIGH_BIT_DEPTH | |
| 325     // Work on first four results | |
| 326     { | |
| 327       // Add/subtract | |
| 328       const __m128i r0 = ADD_EPI16(q0, q3); | |
| 329       const __m128i r1 = ADD_EPI16(q1, q2); | |
| 330       const __m128i r2 = SUB_EPI16(q1, q2); | |
| 331       const __m128i r3 = SUB_EPI16(q0, q3); | |
| 332 #if DCT_HIGH_BIT_DEPTH | |
| 333       overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); | |
| 334       if (overflow) { | |
| 335         vp9_highbd_fdct8x8_c(input, output, stride); | |
| 336         return; | |
| 337       } | |
| 338 #endif  // DCT_HIGH_BIT_DEPTH | |
| 339       // Interleave to do the multiply by constants which gets us into 32bits | |
| 340       { | |
| 341         const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | |
| 342         const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | |
| 343         const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | |
| 344         const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | |
| 345         const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); | |
| 346         const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); | |
| 347         const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); | |
| 348         const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); | |
| 349         const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); | |
| 350         const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); | |
| 351         const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); | |
| 352         const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); | |
| 353         // dct_const_round_shift | |
| 354         const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | |
| 355         const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | |
| 356         const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | |
| 357         const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | |
| 358         const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | |
| 359         const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | |
| 360         const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | |
| 361         const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | |
| 362         const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | |
| 363         const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | |
| 364         const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | |
| 365         const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | |
| 366         const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | |
| 367         const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | |
| 368         const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | |
| 369         const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | |
| 370         // Combine | |
| 371         res0 = _mm_packs_epi32(w0, w1); | |
| 372         res4 = _mm_packs_epi32(w2, w3); | |
| 373         res2 = _mm_packs_epi32(w4, w5); | |
| 374         res6 = _mm_packs_epi32(w6, w7); | |
| 375 #if DCT_HIGH_BIT_DEPTH | |
| 376         overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); | |
| 377         if (overflow) { | |
| 378           vp9_highbd_fdct8x8_c(input, output, stride); | |
| 379           return; | |
| 380         } | |
| 381 #endif  // DCT_HIGH_BIT_DEPTH | |
| 382       } | |
| 383     } | |
| 384     // Work on next four results | |
| 385     { | |
| 386       // Interleave to do the multiply by constants which gets us into 32bits | |
| 387       const __m128i d0 = _mm_unpacklo_epi16(q6, q5); | |
| 388       const __m128i d1 = _mm_unpackhi_epi16(q6, q5); | |
| 389       const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); | |
| 390       const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); | |
| 391       const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); | |
| 392       const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); | |
| 393       // dct_const_round_shift | |
| 394       const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); | |
| 395       const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | |
| 396       const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | |
| 397       const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | |
| 398       const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | |
| 399       const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | |
| 400       const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | |
| 401       const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | |
| 402       // Combine | |
| 403       const __m128i r0 = _mm_packs_epi32(s0, s1); | |
| 404       const __m128i r1 = _mm_packs_epi32(s2, s3); | |
| 405 #if DCT_HIGH_BIT_DEPTH | |
| 406       overflow = check_epi16_overflow_x2(&r0, &r1); | |
| 407       if (overflow) { | |
| 408         vp9_highbd_fdct8x8_c(input, output, stride); | |
| 409         return; | |
| 410       } | |
| 411 #endif  // DCT_HIGH_BIT_DEPTH | |
| 412       { | |
| 413         // Add/subtract | |
| 414         const __m128i x0 = ADD_EPI16(q4, r0); | |
| 415         const __m128i x1 = SUB_EPI16(q4, r0); | |
| 416         const __m128i x2 = SUB_EPI16(q7, r1); | |
| 417         const __m128i x3 = ADD_EPI16(q7, r1); | |
| 418 #if DCT_HIGH_BIT_DEPTH | |
| 419         overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); | |
| 420         if (overflow) { | |
| 421           vp9_highbd_fdct8x8_c(input, output, stride); | |
| 422           return; | |
| 423         } | |
| 424 #endif  // DCT_HIGH_BIT_DEPTH | |
| 425         // Interleave to do the multiply by constants which gets us into 32bits | |
| 426         { | |
| 427           const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | |
| 428           const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | |
| 429           const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | |
| 430           const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | |
| 431           const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | |
| 432           const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); | |
| 433           const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); | |
| 434           const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); | |
| 435           const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); | |
| 436           const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); | |
| 437           const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); | |
| 438           const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); | |
| 439           // dct_const_round_shift | |
| 440           const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | |
| 441           const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | |
| 442           const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | |
| 443           const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | |
| 444           const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | |
| 445           const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | |
| 446           const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | |
| 447           const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | |
| 448           const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | |
| 449           const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | |
| 450           const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | |
| 451           const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | |
| 452           const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | |
| 453           const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | |
| 454           const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | |
| 455           const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | |
| 456           // Combine | |
| 457           res1 = _mm_packs_epi32(w0, w1); | |
| 458           res7 = _mm_packs_epi32(w2, w3); | |
| 459           res5 = _mm_packs_epi32(w4, w5); | |
| 460           res3 = _mm_packs_epi32(w6, w7); | |
| 461 #if DCT_HIGH_BIT_DEPTH | |
| 462           overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); | |
| 463           if (overflow) { | |
| 464             vp9_highbd_fdct8x8_c(input, output, stride); | |
| 465             return; | |
| 466           } | |
| 467 #endif  // DCT_HIGH_BIT_DEPTH | |
| 468         } | |
| 469       } | |
| 470     } | |
| 471     // Transpose the 8x8. | |
| 472     { | |
| 473       // 00 01 02 03 04 05 06 07 | |
| 474       // 10 11 12 13 14 15 16 17 | |
| 475       // 20 21 22 23 24 25 26 27 | |
| 476       // 30 31 32 33 34 35 36 37 | |
| 477       // 40 41 42 43 44 45 46 47 | |
| 478       // 50 51 52 53 54 55 56 57 | |
| 479       // 60 61 62 63 64 65 66 67 | |
| 480       // 70 71 72 73 74 75 76 77 | |
| 481       const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); | |
| 482       const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); | |
| 483       const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); | |
| 484       const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); | |
| 485       const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); | |
| 486       const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); | |
| 487       const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); | |
| 488       const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); | |
| 489       // 00 10 01 11 02 12 03 13 | |
| 490       // 20 30 21 31 22 32 23 33 | |
| 491       // 04 14 05 15 06 16 07 17 | |
| 492       // 24 34 25 35 26 36 27 37 | |
| 493       // 40 50 41 51 42 52 43 53 | |
| 494       // 60 70 61 71 62 72 63 73 | |
| 495       // 44 54 45 55 46 56 47 57 | |
| 496       // 64 74 65 75 66 76 67 77 | |
| 497       const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); | |
| 498       const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); | |
| 499       const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); | |
| 500       const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); | |
| 501       const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); | |
| 502       const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); | |
| 503       const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); | |
| 504       const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); | |
| 505       // 00 10 20 30 01 11 21 31 | |
| 506       // 40 50 60 70 41 51 61 71 | |
| 507       // 02 12 22 32 03 13 23 33 | |
| 508       // 42 52 62 72 43 53 63 73 | |
| 509       // 04 14 24 34 05 15 25 35 | |
| 510       // 44 54 64 74 45 55 65 75 | |
| 511       // 06 16 26 36 07 17 27 37 | |
| 512       // 46 56 66 76 47 57 67 77 | |
| 513       in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); | |
| 514       in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); | |
| 515       in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); | |
| 516       in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); | |
| 517       in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); | |
| 518       in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); | |
| 519       in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); | |
| 520       in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); | |
| 521       // 00 10 20 30 40 50 60 70 | |
| 522       // 01 11 21 31 41 51 61 71 | |
| 523       // 02 12 22 32 42 52 62 72 | |
| 524       // 03 13 23 33 43 53 63 73 | |
| 525       // 04 14 24 34 44 54 64 74 | |
| 526       // 05 15 25 35 45 55 65 75 | |
| 527       // 06 16 26 36 46 56 66 76 | |
| 528       // 07 17 27 37 47 57 67 77 | |
| 529     } | |
| 530   } | |
| 531   // Post-condition output and store it | |
| 532   { | |
| 533     // Post-condition (division by two) | |
| 534     // division of two 16 bits signed numbers using shifts | |
| 535     // n / 2 = (n - (n >> 15)) >> 1 | |
| 536     const __m128i sign_in0 = _mm_srai_epi16(in0, 15); | |
| 537     const __m128i sign_in1 = _mm_srai_epi16(in1, 15); | |
| 538     const __m128i sign_in2 = _mm_srai_epi16(in2, 15); | |
| 539     const __m128i sign_in3 = _mm_srai_epi16(in3, 15); | |
| 540     const __m128i sign_in4 = _mm_srai_epi16(in4, 15); | |
| 541     const __m128i sign_in5 = _mm_srai_epi16(in5, 15); | |
| 542     const __m128i sign_in6 = _mm_srai_epi16(in6, 15); | |
| 543     const __m128i sign_in7 = _mm_srai_epi16(in7, 15); | |
| 544     in0 = _mm_sub_epi16(in0, sign_in0); | |
| 545     in1 = _mm_sub_epi16(in1, sign_in1); | |
| 546     in2 = _mm_sub_epi16(in2, sign_in2); | |
| 547     in3 = _mm_sub_epi16(in3, sign_in3); | |
| 548     in4 = _mm_sub_epi16(in4, sign_in4); | |
| 549     in5 = _mm_sub_epi16(in5, sign_in5); | |
| 550     in6 = _mm_sub_epi16(in6, sign_in6); | |
| 551     in7 = _mm_sub_epi16(in7, sign_in7); | |
| 552     in0 = _mm_srai_epi16(in0, 1); | |
| 553     in1 = _mm_srai_epi16(in1, 1); | |
| 554     in2 = _mm_srai_epi16(in2, 1); | |
| 555     in3 = _mm_srai_epi16(in3, 1); | |
| 556     in4 = _mm_srai_epi16(in4, 1); | |
| 557     in5 = _mm_srai_epi16(in5, 1); | |
| 558     in6 = _mm_srai_epi16(in6, 1); | |
| 559     in7 = _mm_srai_epi16(in7, 1); | |
| 560     // store results | |
| 561     store_output(&in0, (output + 0 * 8)); | |
| 562     store_output(&in1, (output + 1 * 8)); | |
| 563     store_output(&in2, (output + 2 * 8)); | |
| 564     store_output(&in3, (output + 3 * 8)); | |
| 565     store_output(&in4, (output + 4 * 8)); | |
| 566     store_output(&in5, (output + 5 * 8)); | |
| 567     store_output(&in6, (output + 6 * 8)); | |
| 568     store_output(&in7, (output + 7 * 8)); | |
| 569   } | |
| 570 } | |
| 571 | |
| 572 void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { | |
| 573 // The 2D transform is done with two passes which are actually pretty | |
| 574 // similar. In the first one, we transform the columns and transpose | |
| 575 // the results. In the second one, we transform the rows. To achieve that, | |
| 576 // as the first pass results are transposed, we transpose the columns (that | |
| 577 // is the transposed rows) and transpose the results (so that it goes back | |
| 578 // in normal/row positions). | |
| 579 int pass; | |
| 580 // We need an intermediate buffer between passes. | |
| 581 DECLARE_ALIGNED(16, int16_t, intermediate[256]); | |
| 582 const int16_t *in = input; | |
| 583 int16_t *out0 = intermediate; | |
| 584 tran_low_t *out1 = output; | |
| 585 // Constants | |
| 586 // When we use them, in one case, they are all the same. In all others | |
| 587 // it's a pair of them that we need to repeat four times. This is done | |
| 588 // by constructing the 32 bit constant corresponding to that pair. | |
| 589 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | |
| 590 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | |
| 591 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | |
| 592 const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); | |
| 593 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | |
| 594 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | |
| 595 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | |
| 596 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); | |
| 597 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | |
| 598 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); | |
| 599 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); | |
| 600 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); | |
| 601 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); | |
| 602 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); | |
| 603 const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); | |
| 604 const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); | |
| 605 const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); | |
| 606 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | |
| 607 const __m128i kOne = _mm_set1_epi16(1); | |
| 608 // Do the two transform/transpose passes | |
| 609 for (pass = 0; pass < 2; ++pass) { | |
| 610 // We process eight columns (transposed rows in second pass) at a time. | |
| 611 int column_start; | |
| 612 #if DCT_HIGH_BIT_DEPTH | |
| 613 int overflow; | |
| 614 #endif | |
| 615 for (column_start = 0; column_start < 16; column_start += 8) { | |
| 616 __m128i in00, in01, in02, in03, in04, in05, in06, in07; | |
| 617 __m128i in08, in09, in10, in11, in12, in13, in14, in15; | |
| 618 __m128i input0, input1, input2, input3, input4, input5, input6, input7; | |
| 619 __m128i step1_0, step1_1, step1_2, step1_3; | |
| 620 __m128i step1_4, step1_5, step1_6, step1_7; | |
| 621 __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; | |
| 622 __m128i step3_0, step3_1, step3_2, step3_3; | |
| 623 __m128i step3_4, step3_5, step3_6, step3_7; | |
| 624 __m128i res00, res01, res02, res03, res04, res05, res06, res07; | |
| 625 __m128i res08, res09, res10, res11, res12, res13, res14, res15; | |
| 626 // Load and pre-condition input. | |
| 627 if (0 == pass) { | |
| 628 in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); | |
| 629 in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); | |
| 630 in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); | |
| 631 in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); | |
| 632 in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); | |
| 633 in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); | |
| 634 in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); | |
| 635 in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); | |
| 636 in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); | |
| 637 in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); | |
| 638 in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); | |
| 639 in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); | |
| 640 in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); | |
| 641 in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); | |
| 642 in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); | |
| 643 in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); | |
| 644 // x = x << 2 | |
| 645 in00 = _mm_slli_epi16(in00, 2); | |
| 646 in01 = _mm_slli_epi16(in01, 2); | |
| 647 in02 = _mm_slli_epi16(in02, 2); | |
| 648 in03 = _mm_slli_epi16(in03, 2); | |
| 649 in04 = _mm_slli_epi16(in04, 2); | |
| 650 in05 = _mm_slli_epi16(in05, 2); | |
| 651 in06 = _mm_slli_epi16(in06, 2); | |
| 652 in07 = _mm_slli_epi16(in07, 2); | |
| 653 in08 = _mm_slli_epi16(in08, 2); | |
| 654 in09 = _mm_slli_epi16(in09, 2); | |
| 655 in10 = _mm_slli_epi16(in10, 2); | |
| 656 in11 = _mm_slli_epi16(in11, 2); | |
| 657 in12 = _mm_slli_epi16(in12, 2); | |
| 658 in13 = _mm_slli_epi16(in13, 2); | |
| 659 in14 = _mm_slli_epi16(in14, 2); | |
| 660 in15 = _mm_slli_epi16(in15, 2); | |
| 661 } else { | |
| 662 in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); | |
| 663 in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); | |
| 664 in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); | |
| 665 in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); | |
| 666 in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); | |
| 667 in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); | |
| 668 in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); | |
| 669 in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); | |
| 670 in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); | |
| 671 in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); | |
| 672 in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); | |
| 673 in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); | |
| 674 in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); | |
| 675 in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); | |
| 676 in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); | |
| 677 in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); | |
| 678 // x = (x + 1) >> 2 | |
| 679 in00 = _mm_add_epi16(in00, kOne); | |
| 680 in01 = _mm_add_epi16(in01, kOne); | |
| 681 in02 = _mm_add_epi16(in02, kOne); | |
| 682 in03 = _mm_add_epi16(in03, kOne); | |
| 683 in04 = _mm_add_epi16(in04, kOne); | |
| 684 in05 = _mm_add_epi16(in05, kOne); | |
| 685 in06 = _mm_add_epi16(in06, kOne); | |
| 686 in07 = _mm_add_epi16(in07, kOne); | |
| 687 in08 = _mm_add_epi16(in08, kOne); | |
| 688 in09 = _mm_add_epi16(in09, kOne); | |
| 689 in10 = _mm_add_epi16(in10, kOne); | |
| 690 in11 = _mm_add_epi16(in11, kOne); | |
| 691 in12 = _mm_add_epi16(in12, kOne); | |
| 692 in13 = _mm_add_epi16(in13, kOne); | |
| 693 in14 = _mm_add_epi16(in14, kOne); | |
| 694 in15 = _mm_add_epi16(in15, kOne); | |
| 695 in00 = _mm_srai_epi16(in00, 2); | |
| 696 in01 = _mm_srai_epi16(in01, 2); | |
| 697 in02 = _mm_srai_epi16(in02, 2); | |
| 698 in03 = _mm_srai_epi16(in03, 2); | |
| 699 in04 = _mm_srai_epi16(in04, 2); | |
| 700 in05 = _mm_srai_epi16(in05, 2); | |
| 701 in06 = _mm_srai_epi16(in06, 2); | |
| 702 in07 = _mm_srai_epi16(in07, 2); | |
| 703 in08 = _mm_srai_epi16(in08, 2); | |
| 704 in09 = _mm_srai_epi16(in09, 2); | |
| 705 in10 = _mm_srai_epi16(in10, 2); | |
| 706 in11 = _mm_srai_epi16(in11, 2); | |
| 707 in12 = _mm_srai_epi16(in12, 2); | |
| 708 in13 = _mm_srai_epi16(in13, 2); | |
| 709 in14 = _mm_srai_epi16(in14, 2); | |
| 710 in15 = _mm_srai_epi16(in15, 2); | |
| 711 } | |
| 712 in += 8; | |
| 713 // Calculate input for the first 8 results. | |
| 714 { | |
| 715 input0 = ADD_EPI16(in00, in15); | |
| 716 input1 = ADD_EPI16(in01, in14); | |
| 717 input2 = ADD_EPI16(in02, in13); | |
| 718 input3 = ADD_EPI16(in03, in12); | |
| 719 input4 = ADD_EPI16(in04, in11); | |
| 720 input5 = ADD_EPI16(in05, in10); | |
| 721 input6 = ADD_EPI16(in06, in09); | |
| 722 input7 = ADD_EPI16(in07, in08); | |
| 723 #if DCT_HIGH_BIT_DEPTH | |
| 724 overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, | |
| 725 &input4, &input5, &input6, &input7); | |
| 726 if (overflow) { | |
| 727 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 728 return; | |
| 729 } | |
| 730 #endif // DCT_HIGH_BIT_DEPTH | |
| 731 } | |
| 732 // Calculate input for the next 8 results. | |
| 733 { | |
| 734 step1_0 = SUB_EPI16(in07, in08); | |
| 735 step1_1 = SUB_EPI16(in06, in09); | |
| 736 step1_2 = SUB_EPI16(in05, in10); | |
| 737 step1_3 = SUB_EPI16(in04, in11); | |
| 738 step1_4 = SUB_EPI16(in03, in12); | |
| 739 step1_5 = SUB_EPI16(in02, in13); | |
| 740 step1_6 = SUB_EPI16(in01, in14); | |
| 741 step1_7 = SUB_EPI16(in00, in15); | |
| 742 #if DCT_HIGH_BIT_DEPTH | |
| 743 overflow = check_epi16_overflow_x8(&step1_0, &step1_1, | |
| 744 &step1_2, &step1_3, | |
| 745 &step1_4, &step1_5, | |
| 746 &step1_6, &step1_7); | |
| 747 if (overflow) { | |
| 748 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 749 return; | |
| 750 } | |
| 751 #endif // DCT_HIGH_BIT_DEPTH | |
| 752 } | |
| 753 // Work on the first eight values; fdct8(input, even_results); | |
| 754 { | |
| 755 // Add/subtract | |
| 756 const __m128i q0 = ADD_EPI16(input0, input7); | |
| 757 const __m128i q1 = ADD_EPI16(input1, input6); | |
| 758 const __m128i q2 = ADD_EPI16(input2, input5); | |
| 759 const __m128i q3 = ADD_EPI16(input3, input4); | |
| 760 const __m128i q4 = SUB_EPI16(input3, input4); | |
| 761 const __m128i q5 = SUB_EPI16(input2, input5); | |
| 762 const __m128i q6 = SUB_EPI16(input1, input6); | |
| 763 const __m128i q7 = SUB_EPI16(input0, input7); | |
| 764 #if DCT_HIGH_BIT_DEPTH | |
| 765 overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, | |
| 766 &q4, &q5, &q6, &q7); | |
| 767 if (overflow) { | |
| 768 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 769 return; | |
| 770 } | |
| 771 #endif // DCT_HIGH_BIT_DEPTH | |
| 772 // Work on first four results | |
| 773 { | |
| 774 // Add/subtract | |
| 775 const __m128i r0 = ADD_EPI16(q0, q3); | |
| 776 const __m128i r1 = ADD_EPI16(q1, q2); | |
| 777 const __m128i r2 = SUB_EPI16(q1, q2); | |
| 778 const __m128i r3 = SUB_EPI16(q0, q3); | |
| 779 #if DCT_HIGH_BIT_DEPTH | |
| 780 overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); | |
| 781 if (overflow) { | |
| 782 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 783 return; | |
| 784 } | |
| 785 #endif // DCT_HIGH_BIT_DEPTH | |
| 786 // Interleave to do the multiply by constants which gets us | |
| 787 // into 32 bits. | |
| 788 { | |
| 789 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); | |
| 790 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); | |
| 791 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | |
| 792 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | |
| 793 res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, | |
| 794 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 795 res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, | |
| 796 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 797 res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, | |
| 798 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 799 res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, | |
| 800 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 801 #if DCT_HIGH_BIT_DEPTH | |
| 802 overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); | |
| 803 if (overflow) { | |
| 804 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 805 return; | |
| 806 } | |
| 807 #endif // DCT_HIGH_BIT_DEPTH | |
| 808 } | |
| 809 } | |
| 810 // Work on next four results | |
| 811 { | |
| 812 // Interleave to do the multiply by constants which gets us | |
| 813 // into 32 bits. | |
| 814 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); | |
| 815 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); | |
| 816 const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16, | |
| 817 &k__DCT_CONST_ROUNDING, | |
| 818 DCT_CONST_BITS); | |
| 819 const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16, | |
| 820 &k__DCT_CONST_ROUNDING, | |
| 821 DCT_CONST_BITS); | |
| 822 #if DCT_HIGH_BIT_DEPTH | |
| 823 overflow = check_epi16_overflow_x2(&r0, &r1); | |
| 824 if (overflow) { | |
| 825 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 826 return; | |
| 827 } | |
| 828 #endif // DCT_HIGH_BIT_DEPTH | |
| 829 { | |
| 830 // Add/subtract | |
| 831 const __m128i x0 = ADD_EPI16(q4, r0); | |
| 832 const __m128i x1 = SUB_EPI16(q4, r0); | |
| 833 const __m128i x2 = SUB_EPI16(q7, r1); | |
| 834 const __m128i x3 = ADD_EPI16(q7, r1); | |
| 835 #if DCT_HIGH_BIT_DEPTH | |
| 836 overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); | |
| 837 if (overflow) { | |
| 838 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 839 return; | |
| 840 } | |
| 841 #endif // DCT_HIGH_BIT_DEPTH | |
| 842 // Interleave to do the multiply by constants which gets us | |
| 843 // into 32 bits. | |
| 844 { | |
| 845 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | |
| 846 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | |
| 847 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | |
| 848 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | |
| 849 res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, | |
| 850 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 851 res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, | |
| 852 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 853 res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, | |
| 854 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 855 res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, | |
| 856 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 857 #if DCT_HIGH_BIT_DEPTH | |
| 858 overflow = check_epi16_overflow_x4(&res02, &res14, | |
| 859 &res10, &res06); | |
| 860 if (overflow) { | |
| 861 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 862 return; | |
| 863 } | |
| 864 #endif // DCT_HIGH_BIT_DEPTH | |
| 865 } | |
| 866 } | |
| 867 } | |
| 868 } | |
| 869 // Work on the next eight values; step1 -> odd_results | |
| 870 { | |
| 871 // step 2 | |
| 872 { | |
| 873 const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); | |
| 874 const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); | |
| 875 const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); | |
| 876 const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); | |
| 877 step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, | |
| 878 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 879 step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, | |
| 880 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 881 step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, | |
| 882 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 883 step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, | |
| 884 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 885 #if DCT_HIGH_BIT_DEPTH | |
| 886 overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, | |
| 887 &step2_4); | |
| 888 if (overflow) { | |
| 889 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 890 return; | |
| 891 } | |
| 892 #endif // DCT_HIGH_BIT_DEPTH | |
| 893 } | |
| 894 // step 3 | |
| 895 { | |
| 896 step3_0 = ADD_EPI16(step1_0, step2_3); | |
| 897 step3_1 = ADD_EPI16(step1_1, step2_2); | |
| 898 step3_2 = SUB_EPI16(step1_1, step2_2); | |
| 899 step3_3 = SUB_EPI16(step1_0, step2_3); | |
| 900 step3_4 = SUB_EPI16(step1_7, step2_4); | |
| 901 step3_5 = SUB_EPI16(step1_6, step2_5); | |
| 902 step3_6 = ADD_EPI16(step1_6, step2_5); | |
| 903 step3_7 = ADD_EPI16(step1_7, step2_4); | |
| 904 #if DCT_HIGH_BIT_DEPTH | |
| 905 overflow = check_epi16_overflow_x8(&step3_0, &step3_1, | |
| 906 &step3_2, &step3_3, | |
| 907 &step3_4, &step3_5, | |
| 908 &step3_6, &step3_7); | |
| 909 if (overflow) { | |
| 910 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 911 return; | |
| 912 } | |
| 913 #endif // DCT_HIGH_BIT_DEPTH | |
| 914 } | |
| 915 // step 4 | |
| 916 { | |
| 917 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); | |
| 918 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); | |
| 919 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); | |
| 920 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); | |
| 921 step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, | |
| 922 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 923 step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, | |
| 924 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 925 step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, | |
| 926 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 927 step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, | |
| 928 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 929 #if DCT_HIGH_BIT_DEPTH | |
| 930 overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, | |
| 931 &step2_5); | |
| 932 if (overflow) { | |
| 933 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 934 return; | |
| 935 } | |
| 936 #endif // DCT_HIGH_BIT_DEPTH | |
| 937 } | |
| 938 // step 5 | |
| 939 { | |
| 940 step1_0 = ADD_EPI16(step3_0, step2_1); | |
| 941 step1_1 = SUB_EPI16(step3_0, step2_1); | |
| 942 step1_2 = ADD_EPI16(step3_3, step2_2); | |
| 943 step1_3 = SUB_EPI16(step3_3, step2_2); | |
| 944 step1_4 = SUB_EPI16(step3_4, step2_5); | |
| 945 step1_5 = ADD_EPI16(step3_4, step2_5); | |
| 946 step1_6 = SUB_EPI16(step3_7, step2_6); | |
| 947 step1_7 = ADD_EPI16(step3_7, step2_6); | |
| 948 #if DCT_HIGH_BIT_DEPTH | |
| 949 overflow = check_epi16_overflow_x8(&step1_0, &step1_1, | |
| 950 &step1_2, &step1_3, | |
| 951 &step1_4, &step1_5, | |
| 952 &step1_6, &step1_7); | |
| 953 if (overflow) { | |
| 954 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 955 return; | |
| 956 } | |
| 957 #endif // DCT_HIGH_BIT_DEPTH | |
| 958 } | |
| 959 // step 6 | |
| 960 { | |
| 961 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); | |
| 962 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); | |
| 963 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); | |
| 964 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); | |
| 965 res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, | |
| 966 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 967 res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, | |
| 968 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 969 res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, | |
| 970 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 971 res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, | |
| 972 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 973 #if DCT_HIGH_BIT_DEPTH | |
| 974 overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); | |
| 975 if (overflow) { | |
| 976 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 977 return; | |
| 978 } | |
| 979 #endif // DCT_HIGH_BIT_DEPTH | |
| 980 } | |
| 981 { | |
| 982 const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); | |
| 983 const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); | |
| 984 const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); | |
| 985 const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); | |
| 986 res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, | |
| 987 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 988 res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, | |
| 989 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 990 res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, | |
| 991 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 992 res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, | |
| 993 &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); | |
| 994 #if DCT_HIGH_BIT_DEPTH | |
| 995 overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); | |
| 996 if (overflow) { | |
| 997 vp9_highbd_fdct16x16_c(input, output, stride); | |
| 998 return; | |
| 999 } | |
| 1000 #endif // DCT_HIGH_BIT_DEPTH | |
| 1001 } | |
| 1002 } | |
| 1003 // Transpose the results, do it as two 8x8 transposes. | |
| 1004 transpose_and_output8x8(&res00, &res01, &res02, &res03, | |
| 1005 &res04, &res05, &res06, &res07, | |
| 1006 pass, out0, out1); | |
| 1007 transpose_and_output8x8(&res08, &res09, &res10, &res11, | |
| 1008 &res12, &res13, &res14, &res15, | |
| 1009 pass, out0 + 8, out1 + 8); | |
| 1010 if (pass == 0) { | |
| 1011 out0 += 8*16; | |
| 1012 } else { | |
| 1013 out1 += 8*16; | |
| 1014 } | |
| 1015 } | |
| 1016 // Setup in/out for next pass. | |
| 1017 in = intermediate; | |
| 1018 } | |
| 1019 } | |
| 1020 | |
| 1021 #undef ADD_EPI16 | |
| 1022 #undef SUB_EPI16 | |
| OLD | NEW |