OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <immintrin.h> // AVX2 | 11 #include <immintrin.h> // AVX2 |
12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
14 | 14 |
// pair256_set_epi16(a, b): expands to a single _mm256_set_epi16() call whose
// 16 arguments alternate b, a, b, a, ..., i.e. the 16-bit pair (a, b) is
// repeated eight times across the 256-bit vector.
// The OLD column forwards a and b to the intrinsic unchanged; the NEW column
// parenthesizes every macro argument and casts it to int16_t first.
// NOTE(review): presumably the casts exist to silence implicit narrowing
// warnings when int-typed cospi constants are passed -- confirm against the
// change description.
15 #define pair256_set_epi16(a, b) \ | 15 #define pair256_set_epi16(a, b) \ |
16 _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a) | 16 _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ |
| 17 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ |
| 18 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ |
| 19 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) |
17 | 20 |
// pair256_set_epi32(a, b): expands to a single _mm256_set_epi32() call whose
// 8 arguments alternate b, a, b, a, ..., i.e. the 32-bit pair (a, b) is
// repeated four times across the 256-bit vector.
// The OLD column forwards a and b to the intrinsic unchanged; the NEW column
// parenthesizes every macro argument and casts it to int first.
// NOTE(review): presumably the casts make the argument type explicit for
// compilers that warn on implicit conversion -- confirm against the change
// description.
18 #define pair256_set_epi32(a, b) \ | 21 #define pair256_set_epi32(a, b) \ |
19 _mm256_set_epi32(b, a, b, a, b, a, b, a) | 22 _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \ |
20 | 23 (int)(b), (int)(a), (int)(b), (int)(a)) |
21 | |
22 | |
23 | 24 |
24 #if FDCT32x32_HIGH_PRECISION | 25 #if FDCT32x32_HIGH_PRECISION |
// k_madd_epi32_avx2(a, b): returns the 64-bit-lane sum of two
// _mm256_mul_epu32() products. The first product is taken from a and b as
// passed in; the second is taken after both inputs have been shifted right by
// 32 bits within each 64-bit lane (_mm256_srli_epi64), so it uses what were
// the upper 32 bits of every lane. The two products are combined with
// _mm256_add_epi64().
// a and b are passed by value, so reassigning them here does not modify the
// caller's vectors. This helper sits after `#if FDCT32x32_HIGH_PRECISION` and
// is unchanged between the OLD and NEW columns apart from its line number.
// NOTE(review): _mm256_mul_epu32 is documented by Intel as an unsigned
// multiply -- presumably callers only keep result bits that are the same for
// signed and unsigned products; confirm at the call sites.
25 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { | 26 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { |
26 __m256i buf0, buf1; | 27 __m256i buf0, buf1; |
27 buf0 = _mm256_mul_epu32(a, b); | 28 buf0 = _mm256_mul_epu32(a, b); |
28 a = _mm256_srli_epi64(a, 32); | 29 a = _mm256_srli_epi64(a, 32); |
29 b = _mm256_srli_epi64(b, 32); | 30 b = _mm256_srli_epi64(b, 32); |
30 buf1 = _mm256_mul_epu32(a, b); | 31 buf1 = _mm256_mul_epu32(a, b); |
31 return _mm256_add_epi64(buf0, buf1); | 32 return _mm256_add_epi64(buf0, buf1); |
32 } | 33 } |
(...skipping 10 matching lines...) Expand all Loading... |
43 // Calculate pre-multiplied strides | 44 // Calculate pre-multiplied strides |
44 const int str1 = stride; | 45 const int str1 = stride; |
45 const int str2 = 2 * stride; | 46 const int str2 = 2 * stride; |
46 const int str3 = 2 * stride + str1; | 47 const int str3 = 2 * stride + str1; |
47 // We need an intermediate buffer between passes. | 48 // We need an intermediate buffer between passes. |
48 DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); | 49 DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); |
49 // Constants | 50 // Constants |
50 // When we use them, in one case, they are all the same. In all others | 51 // When we use them, in one case, they are all the same. In all others |
51 // it's a pair of them that we need to repeat four times. This is done | 52 // it's a pair of them that we need to repeat four times. This is done |
52 // by constructing the 32 bit constant corresponding to that pair. | 53 // by constructing the 32 bit constant corresponding to that pair. |
53 const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64); | 54 const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); |
54 const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64)
; | 55 const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64)
; |
55 const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64)
; | 56 const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64)
; |
56 const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); | 57 const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); |
57 const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); | 58 const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); |
58 const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64)
; | 59 const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64)
; |
59 const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64)
; | 60 const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64)
; |
60 const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64)
; | 61 const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64)
; |
61 const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); | 62 const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); |
62 const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); | 63 const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); |
63 const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64)
; | 64 const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64)
; |
(...skipping 2637 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2701 _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extract
f128_si256(tr2_6,1)); | 2702 _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extract
f128_si256(tr2_6,1)); |
2702 _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extract
f128_si256(tr2_7,1)); | 2703 _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extract
f128_si256(tr2_7,1)); |
2703 // Process next 8x8 | 2704 // Process next 8x8 |
2704 output_currStep += 8; | 2705 output_currStep += 8; |
2705 output_nextStep += 8; | 2706 output_nextStep += 8; |
2706 } | 2707 } |
2707 } | 2708 } |
2708 } | 2709 } |
2709 } | 2710 } |
2710 } // NOLINT | 2711 } // NOLINT |
OLD | NEW |