OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
12 #include "vp9/common/vp9_idct.h" // for cospi constants | 12 #include "vp9/common/vp9_idct.h" // for cospi constants |
13 #include "vpx_ports/mem.h" | 13 #include "vpx_ports/mem.h" |
14 | 14 |
// pair_set_epi32(a, b): expands to a single _mm_set_epi32 call with the
// arguments ordered (b, a, b, a), i.e. the pair (a, b) repeated twice across
// the four 32-bit lanes. _mm_set_epi32 lists its arguments from the highest
// lane to the lowest, so lane 0 holds a and lane 1 holds b.
// In the NEW column each argument is parenthesised and cast to int before
// being passed on.
15 #define pair_set_epi32(a, b) \ | 15 #define pair_set_epi32(a, b) \ |
16 _mm_set_epi32(b, a, b, a) | 16 _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) |
17 | 17 |
18 #if FDCT32x32_HIGH_PRECISION | 18 #if FDCT32x32_HIGH_PRECISION |
// k_madd_epi32: for each 64-bit half of the inputs, multiplies the low 32-bit
// elements of a and b, multiplies the high 32-bit elements of a and b, and
// returns the 64-bit sum of the two products (two 64-bit results per call).
// NOTE(review): _mm_mul_epu32 is an unsigned 32x32->64 multiply, so negative
// inputs are not sign-extended; presumably callers only depend on the low
// 32 bits of each sum -- confirm against the call sites.
19 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { | 19 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { |
20 __m128i buf0, buf1; | 20 __m128i buf0, buf1; |
// Products of the even-indexed (low) 32-bit elements of each 64-bit lane.
21 buf0 = _mm_mul_epu32(a, b); | 21 buf0 = _mm_mul_epu32(a, b); |
// Shift the odd-indexed (high) 32-bit elements down into the low position so
// the same multiply can be applied to them.
22 a = _mm_srli_epi64(a, 32); | 22 a = _mm_srli_epi64(a, 32); |
23 b = _mm_srli_epi64(b, 32); | 23 b = _mm_srli_epi64(b, 32); |
24 buf1 = _mm_mul_epu32(a, b); | 24 buf1 = _mm_mul_epu32(a, b); |
// Lane-wise 64-bit addition of the two partial products.
25 return _mm_add_epi64(buf0, buf1); | 25 return _mm_add_epi64(buf0, buf1); |
26 } | 26 } |
(...skipping 10 matching lines...) Expand all Loading... |
37 // Calculate pre-multiplied strides | 37 // Calculate pre-multiplied strides |
38 const int str1 = stride; | 38 const int str1 = stride; |
39 const int str2 = 2 * stride; | 39 const int str2 = 2 * stride; |
40 const int str3 = 2 * stride + str1; | 40 const int str3 = 2 * stride + str1; |
41 // We need an intermediate buffer between passes. | 41 // We need an intermediate buffer between passes. |
42 DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); | 42 DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); |
43 // Constants | 43 // Constants |
44 // When we use them, in one case, they are all the same. In all others | 44 // When we use them, in one case, they are all the same. In all others |
45 // it's a pair of them that we need to repeat four times. This is done | 45 // it's a pair of them that we need to repeat four times. This is done |
46 // by constructing the 32 bit constant corresponding to that pair. | 46 // by constructing the 32 bit constant corresponding to that pair. |
47 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); | 47 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); |
48 const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); | 48 const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); |
49 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 49 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
50 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 50 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
51 const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); | 51 const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); |
52 const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); | 52 const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); |
53 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 53 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
54 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 54 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
55 const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); | 55 const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); |
56 const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); | 56 const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
57 const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); | 57 const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
(...skipping 2624 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2682 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); | 2682 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); |
2683 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); | 2683 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); |
2684 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); | 2684 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); |
2685 // Process next 8x8 | 2685 // Process next 8x8 |
2686 output += 8; | 2686 output += 8; |
2687 } | 2687 } |
2688 } | 2688 } |
2689 } | 2689 } |
2690 } | 2690 } |
2691 } // NOLINT | 2691 } // NOLINT |
OLD | NEW |