| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
| 12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
| 13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
| 14 | 14 |
| 15 void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { | 15 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { |
| 16 // The 2D transform is done with two passes which are actually pretty | 16 // The 2D transform is done with two passes which are actually pretty |
| 17 // similar. In the first one, we transform the columns and transpose | 17 // similar. In the first one, we transform the columns and transpose |
| 18 // the results. In the second one, we transform the rows. To achieve that, | 18 // the results. In the second one, we transform the rows. To achieve that, |
| 19 // as the first pass results are transposed, we transpose the columns (that | 19 // as the first pass results are transposed, we transpose the columns (that |
| 20 // is the transposed rows) and transpose the results (so that it goes back | 20 // is the transposed rows) and transpose the results (so that it goes back |
| 21 // in normal/row positions). | 21 // in normal/row positions). |
| 22 const int stride = pitch >> 1; | |
| 23 int pass; | 22 int pass; |
| 24 // Constants | 23 // Constants |
| 25 // When we use them, in one case, they are all the same. In all others | 24 // When we use them, in one case, they are all the same. In all others |
| 26 // it's a pair of them that we need to repeat four times. This is done | 25 // it's a pair of them that we need to repeat four times. This is done |
| 27 // by constructing the 32 bit constant corresponding to that pair. | 26 // by constructing the 32 bit constant corresponding to that pair. |
| 28 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 27 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 29 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 28 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 30 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 29 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 31 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 30 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 32 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 31 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| (...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 105 __m128i out01 = _mm_add_epi16(in0, kOne); | 104 __m128i out01 = _mm_add_epi16(in0, kOne); |
| 106 __m128i out23 = _mm_add_epi16(in2, kOne); | 105 __m128i out23 = _mm_add_epi16(in2, kOne); |
| 107 out01 = _mm_srai_epi16(out01, 2); | 106 out01 = _mm_srai_epi16(out01, 2); |
| 108 out23 = _mm_srai_epi16(out23, 2); | 107 out23 = _mm_srai_epi16(out23, 2); |
| 109 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); | 108 _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); |
| 110 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); | 109 _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); |
| 111 } | 110 } |
| 112 } | 111 } |
| 113 } | 112 } |
| 114 | 113 |
| 115 void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { | 114 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, |
| 116 vp9_short_fdct4x4_sse2(input, output, pitch); | 115 int stride) { |
| 117 vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); | |
| 118 } | |
| 119 | |
| 120 static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { | |
| 121 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); | 116 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); |
| 122 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); | 117 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); |
| 123 __m128i mask; | 118 __m128i mask; |
| 124 | 119 |
| 125 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); | 120 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); |
| 126 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); | 121 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); |
| 127 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); | 122 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); |
| 128 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); | 123 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); |
| 129 | 124 |
| 130 in[0] = _mm_slli_epi16(in[0], 4); | 125 in[0] = _mm_slli_epi16(in[0], 4); |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 164 // 00 10 20 30 01 11 21 31 | 159 // 00 10 20 30 01 11 21 31 |
| 165 // 02 12 22 32 03 13 23 33 | 160 // 02 12 22 32 03 13 23 33 |
| 166 // only use the first 4 16-bit integers | 161 // only use the first 4 16-bit integers |
| 167 res[1] = _mm_unpackhi_epi64(res[0], res[0]); | 162 res[1] = _mm_unpackhi_epi64(res[0], res[0]); |
| 168 res[3] = _mm_unpackhi_epi64(res[2], res[2]); | 163 res[3] = _mm_unpackhi_epi64(res[2], res[2]); |
| 169 } | 164 } |
| 170 | 165 |
| 171 void fdct4_1d_sse2(__m128i *in) { | 166 void fdct4_1d_sse2(__m128i *in) { |
| 172 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 167 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 173 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 168 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 174 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 169 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
| 175 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 170 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
| 176 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); | 171 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 177 | 172 |
| 178 __m128i u[4], v[4]; | 173 __m128i u[4], v[4]; |
| 179 u[0] = _mm_add_epi16(in[0], in[3]); | 174 u[0]=_mm_unpacklo_epi16(in[0], in[1]); |
| 180 u[1] = _mm_add_epi16(in[1], in[2]); | 175 u[1]=_mm_unpacklo_epi16(in[3], in[2]); |
| 181 u[2] = _mm_sub_epi16(in[1], in[2]); | |
| 182 u[3] = _mm_sub_epi16(in[0], in[3]); | |
| 183 | 176 |
| 184 v[0] = _mm_unpacklo_epi16(u[0], u[1]); | 177 v[0] = _mm_add_epi16(u[0], u[1]); |
| 185 v[1] = _mm_unpacklo_epi16(u[2], u[3]); | 178 v[1] = _mm_sub_epi16(u[0], u[1]); |
| 179 |
| 186 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 | 180 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 |
| 187 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 | 181 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 |
| 188 u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 | 182 u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 |
| 189 u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 | 183 u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 |
| 190 | 184 |
| 191 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); | 185 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
| 192 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); | 186 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
| 193 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); | 187 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
| 194 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); | 188 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
| 195 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 189 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
| 196 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 190 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
| 197 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 191 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
| 198 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 192 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
| 199 | 193 |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 242 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); | 236 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
| 243 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); | 237 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
| 244 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); | 238 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
| 245 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); | 239 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
| 246 | 240 |
| 247 in[0] = _mm_packs_epi32(u[0], u[2]); | 241 in[0] = _mm_packs_epi32(u[0], u[2]); |
| 248 in[1] = _mm_packs_epi32(u[1], u[3]); | 242 in[1] = _mm_packs_epi32(u[1], u[3]); |
| 249 transpose_4x4(in); | 243 transpose_4x4(in); |
| 250 } | 244 } |
| 251 | 245 |
| 252 void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, | 246 void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, |
| 253 int stride, int tx_type) { | 247 int stride, int tx_type) { |
| 254 __m128i in[4]; | 248 __m128i in[4]; |
| 255 load_buffer_4x4(input, in, stride); | 249 load_buffer_4x4(input, in, stride); |
| 256 switch (tx_type) { | 250 switch (tx_type) { |
| 257 case 0: // DCT_DCT | 251 case 0: // DCT_DCT |
| 258 fdct4_1d_sse2(in); | 252 fdct4_1d_sse2(in); |
| 259 fdct4_1d_sse2(in); | 253 fdct4_1d_sse2(in); |
| 260 break; | 254 break; |
| 261 case 1: // ADST_DCT | 255 case 1: // ADST_DCT |
| 262 fadst4_1d_sse2(in); | 256 fadst4_1d_sse2(in); |
| 263 fdct4_1d_sse2(in); | 257 fdct4_1d_sse2(in); |
| 264 break; | 258 break; |
| 265 case 2: // DCT_ADST | 259 case 2: // DCT_ADST |
| 266 fdct4_1d_sse2(in); | 260 fdct4_1d_sse2(in); |
| 267 fadst4_1d_sse2(in); | 261 fadst4_1d_sse2(in); |
| 268 break; | 262 break; |
| 269 case 3: // ADST_ADST | 263 case 3: // ADST_ADST |
| 270 fadst4_1d_sse2(in); | 264 fadst4_1d_sse2(in); |
| 271 fadst4_1d_sse2(in); | 265 fadst4_1d_sse2(in); |
| 272 break; | 266 break; |
| 273 default: | 267 default: |
| 274 assert(0); | 268 assert(0); |
| 275 break; | 269 break; |
| 276 } | 270 } |
| 277 write_buffer_4x4(output, in); | 271 write_buffer_4x4(output, in); |
| 278 } | 272 } |
| 279 | 273 |
| 280 void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { | 274 void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { |
| 281 const int stride = pitch >> 1; | |
| 282 int pass; | 275 int pass; |
| 283 // Constants | 276 // Constants |
| 284 // When we use them, in one case, they are all the same. In all others | 277 // When we use them, in one case, they are all the same. In all others |
| 285 // it's a pair of them that we need to repeat four times. This is done | 278 // it's a pair of them that we need to repeat four times. This is done |
| 286 // by constructing the 32 bit constant corresponding to that pair. | 279 // by constructing the 32 bit constant corresponding to that pair. |
| 287 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 280 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 288 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 281 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 289 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 282 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 290 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 283 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 291 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); | 284 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); |
| (...skipping 236 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 528 _mm_store_si128((__m128i *)(output + 2 * 8), in2); | 521 _mm_store_si128((__m128i *)(output + 2 * 8), in2); |
| 529 _mm_store_si128((__m128i *)(output + 3 * 8), in3); | 522 _mm_store_si128((__m128i *)(output + 3 * 8), in3); |
| 530 _mm_store_si128((__m128i *)(output + 4 * 8), in4); | 523 _mm_store_si128((__m128i *)(output + 4 * 8), in4); |
| 531 _mm_store_si128((__m128i *)(output + 5 * 8), in5); | 524 _mm_store_si128((__m128i *)(output + 5 * 8), in5); |
| 532 _mm_store_si128((__m128i *)(output + 6 * 8), in6); | 525 _mm_store_si128((__m128i *)(output + 6 * 8), in6); |
| 533 _mm_store_si128((__m128i *)(output + 7 * 8), in7); | 526 _mm_store_si128((__m128i *)(output + 7 * 8), in7); |
| 534 } | 527 } |
| 535 } | 528 } |
| 536 | 529 |
| 537 // load 8x8 array | 530 // load 8x8 array |
| 538 static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { | 531 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, |
| 539 in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); | 532 int stride) { |
| 540 in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); | 533 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); |
| 541 in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); | 534 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); |
| 542 in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); | 535 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); |
| 543 in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); | 536 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); |
| 544 in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); | 537 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); |
| 545 in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); | 538 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); |
| 546 in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); | 539 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); |
| 540 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); |
| 547 | 541 |
| 548 in[0] = _mm_slli_epi16(in[0], 2); | 542 in[0] = _mm_slli_epi16(in[0], 2); |
| 549 in[1] = _mm_slli_epi16(in[1], 2); | 543 in[1] = _mm_slli_epi16(in[1], 2); |
| 550 in[2] = _mm_slli_epi16(in[2], 2); | 544 in[2] = _mm_slli_epi16(in[2], 2); |
| 551 in[3] = _mm_slli_epi16(in[3], 2); | 545 in[3] = _mm_slli_epi16(in[3], 2); |
| 552 in[4] = _mm_slli_epi16(in[4], 2); | 546 in[4] = _mm_slli_epi16(in[4], 2); |
| 553 in[5] = _mm_slli_epi16(in[5], 2); | 547 in[5] = _mm_slli_epi16(in[5], 2); |
| 554 in[6] = _mm_slli_epi16(in[6], 2); | 548 in[6] = _mm_slli_epi16(in[6], 2); |
| 555 in[7] = _mm_slli_epi16(in[7], 2); | 549 in[7] = _mm_slli_epi16(in[7], 2); |
| 556 } | 550 } |
| (...skipping 469 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1026 in[3] = _mm_sub_epi16(k__const_0, s2); | 1020 in[3] = _mm_sub_epi16(k__const_0, s2); |
| 1027 in[4] = s3; | 1021 in[4] = s3; |
| 1028 in[5] = _mm_sub_epi16(k__const_0, s7); | 1022 in[5] = _mm_sub_epi16(k__const_0, s7); |
| 1029 in[6] = s5; | 1023 in[6] = s5; |
| 1030 in[7] = _mm_sub_epi16(k__const_0, s1); | 1024 in[7] = _mm_sub_epi16(k__const_0, s1); |
| 1031 | 1025 |
| 1032 // transpose | 1026 // transpose |
| 1033 array_transpose_8x8(in, in); | 1027 array_transpose_8x8(in, in); |
| 1034 } | 1028 } |
| 1035 | 1029 |
| 1036 void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, | 1030 void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, |
| 1037 int stride, int tx_type) { | 1031 int stride, int tx_type) { |
| 1038 __m128i in[8]; | 1032 __m128i in[8]; |
| 1039 load_buffer_8x8(input, in, stride); | 1033 load_buffer_8x8(input, in, stride); |
| 1040 switch (tx_type) { | 1034 switch (tx_type) { |
| 1041 case 0: // DCT_DCT | 1035 case 0: // DCT_DCT |
| 1042 fdct8_1d_sse2(in); | 1036 fdct8_1d_sse2(in); |
| 1043 fdct8_1d_sse2(in); | 1037 fdct8_1d_sse2(in); |
| 1044 break; | 1038 break; |
| 1045 case 1: // ADST_DCT | 1039 case 1: // ADST_DCT |
| 1046 fadst8_1d_sse2(in); | 1040 fadst8_1d_sse2(in); |
| 1047 fdct8_1d_sse2(in); | 1041 fdct8_1d_sse2(in); |
| 1048 break; | 1042 break; |
| 1049 case 2: // DCT_ADST | 1043 case 2: // DCT_ADST |
| 1050 fdct8_1d_sse2(in); | 1044 fdct8_1d_sse2(in); |
| 1051 fadst8_1d_sse2(in); | 1045 fadst8_1d_sse2(in); |
| 1052 break; | 1046 break; |
| 1053 case 3: // ADST_ADST | 1047 case 3: // ADST_ADST |
| 1054 fadst8_1d_sse2(in); | 1048 fadst8_1d_sse2(in); |
| 1055 fadst8_1d_sse2(in); | 1049 fadst8_1d_sse2(in); |
| 1056 break; | 1050 break; |
| 1057 default: | 1051 default: |
| 1058 assert(0); | 1052 assert(0); |
| 1059 break; | 1053 break; |
| 1060 } | 1054 } |
| 1061 right_shift_8x8(in, 1); | 1055 right_shift_8x8(in, 1); |
| 1062 write_buffer_8x8(output, in, 8); | 1056 write_buffer_8x8(output, in, 8); |
| 1063 } | 1057 } |
| 1064 | 1058 |
| 1065 void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { | 1059 void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { |
| 1066 // The 2D transform is done with two passes which are actually pretty | 1060 // The 2D transform is done with two passes which are actually pretty |
| 1067 // similar. In the first one, we transform the columns and transpose | 1061 // similar. In the first one, we transform the columns and transpose |
| 1068 // the results. In the second one, we transform the rows. To achieve that, | 1062 // the results. In the second one, we transform the rows. To achieve that, |
| 1069 // as the first pass results are transposed, we transpose the columns (that | 1063 // as the first pass results are transposed, we transpose the columns (that |
| 1070 // is the transposed rows) and transpose the results (so that it goes back | 1064 // is the transposed rows) and transpose the results (so that it goes back |
| 1071 // in normal/row positions). | 1065 // in normal/row positions). |
| 1072 const int stride = pitch >> 1; | |
| 1073 int pass; | 1066 int pass; |
| 1074 // We need an intermediate buffer between passes. | 1067 // We need an intermediate buffer between passes. |
| 1075 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); | 1068 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); |
| 1076 int16_t *in = input; | 1069 const int16_t *in = input; |
| 1077 int16_t *out = intermediate; | 1070 int16_t *out = intermediate; |
| 1078 // Constants | 1071 // Constants |
| 1079 // When we use them, in one case, they are all the same. In all others | 1072 // When we use them, in one case, they are all the same. In all others |
| 1080 // it's a pair of them that we need to repeat four times. This is done | 1073 // it's a pair of them that we need to repeat four times. This is done |
| 1081 // by constructing the 32 bit constant corresponding to that pair. | 1074 // by constructing the 32 bit constant corresponding to that pair. |
| 1082 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); | 1075 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
| 1083 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 1076 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 1084 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); | 1077 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 1085 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 1078 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
| 1086 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 1079 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| (...skipping 594 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1681 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); | 1674 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); |
| 1682 } | 1675 } |
| 1683 out += 8*16; | 1676 out += 8*16; |
| 1684 } | 1677 } |
| 1685 // Setup in/out for next pass. | 1678 // Setup in/out for next pass. |
| 1686 in = intermediate; | 1679 in = intermediate; |
| 1687 out = output; | 1680 out = output; |
| 1688 } | 1681 } |
| 1689 } | 1682 } |
| 1690 | 1683 |
| 1691 static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, | 1684 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, |
| 1692 __m128i *in1, int stride) { | 1685 __m128i *in1, int stride) { |
| 1693 // load first 8 columns | 1686 // load first 8 columns |
| 1694 load_buffer_8x8(input, in0, stride); | 1687 load_buffer_8x8(input, in0, stride); |
| 1695 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); | 1688 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); |
| 1696 | 1689 |
| 1697 input += 8; | 1690 input += 8; |
| 1698 // load second 8 columns | 1691 // load second 8 columns |
| 1699 load_buffer_8x8(input, in1, stride); | 1692 load_buffer_8x8(input, in1, stride); |
| 1700 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); | 1693 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); |
| 1701 } | 1694 } |
| (...skipping 831 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2533 fdct16_1d_8col(in1); | 2526 fdct16_1d_8col(in1); |
| 2534 array_transpose_16x16(in0, in1); | 2527 array_transpose_16x16(in0, in1); |
| 2535 } | 2528 } |
| 2536 | 2529 |
| 2537 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { | 2530 void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { |
| 2538 fadst16_1d_8col(in0); | 2531 fadst16_1d_8col(in0); |
| 2539 fadst16_1d_8col(in1); | 2532 fadst16_1d_8col(in1); |
| 2540 array_transpose_16x16(in0, in1); | 2533 array_transpose_16x16(in0, in1); |
| 2541 } | 2534 } |
| 2542 | 2535 |
| 2543 void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, | 2536 void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, |
| 2544 int stride, int tx_type) { | 2537 int stride, int tx_type) { |
| 2545 __m128i in0[16], in1[16]; | 2538 __m128i in0[16], in1[16]; |
| 2546 load_buffer_16x16(input, in0, in1, stride); | 2539 load_buffer_16x16(input, in0, in1, stride); |
| 2547 switch (tx_type) { | 2540 switch (tx_type) { |
| 2548 case 0: // DCT_DCT | 2541 case 0: // DCT_DCT |
| 2549 fdct16_1d_sse2(in0, in1); | 2542 fdct16_1d_sse2(in0, in1); |
| 2550 right_shift_16x16(in0, in1); | 2543 right_shift_16x16(in0, in1); |
| 2551 fdct16_1d_sse2(in0, in1); | 2544 fdct16_1d_sse2(in0, in1); |
| 2552 break; | 2545 break; |
| 2553 case 1: // ADST_DCT | 2546 case 1: // ADST_DCT |
| (...skipping 11 matching lines...) Expand all Loading... |
| 2565 right_shift_16x16(in0, in1); | 2558 right_shift_16x16(in0, in1); |
| 2566 fadst16_1d_sse2(in0, in1); | 2559 fadst16_1d_sse2(in0, in1); |
| 2567 break; | 2560 break; |
| 2568 default: | 2561 default: |
| 2569 assert(0); | 2562 assert(0); |
| 2570 break; | 2563 break; |
| 2571 } | 2564 } |
| 2572 write_buffer_16x16(output, in0, in1, 16); | 2565 write_buffer_16x16(output, in0, in1, 16); |
| 2573 } | 2566 } |
| 2574 | 2567 |
| 2575 #define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2 | 2568 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 |
| 2576 #define FDCT32x32_HIGH_PRECISION 0 | 2569 #define FDCT32x32_HIGH_PRECISION 0 |
| 2577 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" | 2570 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
| 2578 #undef FDCT32x32_2D | 2571 #undef FDCT32x32_2D |
| 2579 #undef FDCT32x32_HIGH_PRECISION | 2572 #undef FDCT32x32_HIGH_PRECISION |
| 2580 | 2573 |
| 2581 #define FDCT32x32_2D vp9_short_fdct32x32_sse2 | 2574 #define FDCT32x32_2D vp9_fdct32x32_sse2 |
| 2582 #define FDCT32x32_HIGH_PRECISION 1 | 2575 #define FDCT32x32_HIGH_PRECISION 1 |
| 2583 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT | 2576 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT |
| 2584 #undef FDCT32x32_2D | 2577 #undef FDCT32x32_2D |
| 2585 #undef FDCT32x32_HIGH_PRECISION | 2578 #undef FDCT32x32_HIGH_PRECISION |
| OLD | NEW |