source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c - Issue 756673003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <immintrin.h> // AVX2	11 #include <immintrin.h> // AVX2

12 #include "vp9/common/vp9_idct.h" // for cospi constants	12 #include "vp9/common/vp9_idct.h" // for cospi constants

13 #include "vpx_ports/mem.h"	13 #include "vpx_ports/mem.h"

14	14

15 #define pair256_set_epi16(a, b) \	15 #define pair256_set_epi16(a, b) \

16 _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)	16 _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \

	17 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \

	18 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \

	19 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))

17	20

18 #define pair256_set_epi32(a, b) \	21 #define pair256_set_epi32(a, b) \

19 _mm256_set_epi32(b, a, b, a, b, a, b, a)	22 _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \

20	23 (int)(b), (int)(a), (int)(b), (int)(a))

21

22

23	24

24 #if FDCT32x32_HIGH_PRECISION	25 #if FDCT32x32_HIGH_PRECISION

25 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {	26 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {

26 __m256i buf0, buf1;	27 __m256i buf0, buf1;

27 buf0 = _mm256_mul_epu32(a, b);	28 buf0 = _mm256_mul_epu32(a, b);

28 a = _mm256_srli_epi64(a, 32);	29 a = _mm256_srli_epi64(a, 32);

29 b = _mm256_srli_epi64(b, 32);	30 b = _mm256_srli_epi64(b, 32);

30 buf1 = _mm256_mul_epu32(a, b);	31 buf1 = _mm256_mul_epu32(a, b);

31 return _mm256_add_epi64(buf0, buf1);	32 return _mm256_add_epi64(buf0, buf1);

32 }	33 }

(...skipping 10 matching lines...) Expand all Loading...
43 // Calculate pre-multiplied strides	44 // Calculate pre-multiplied strides

44 const int str1 = stride;	45 const int str1 = stride;

45 const int str2 = 2 * stride;	46 const int str2 = 2 * stride;

46 const int str3 = 2 * stride + str1;	47 const int str3 = 2 * stride + str1;

47 // We need an intermediate buffer between passes.	48 // We need an intermediate buffer between passes.

48 DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);	49 DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);

49 // Constants	50 // Constants

50 // When we use them, in one case, they are all the same. In all others	51 // When we use them, in one case, they are all the same. In all others

51 // it's a pair of them that we need to repeat four times. This is done	52 // it's a pair of them that we need to repeat four times. This is done

52 // by constructing the 32 bit constant corresponding to that pair.	53 // by constructing the 32 bit constant corresponding to that pair.

53 const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);	54 const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);

54 const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);	55 const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);

55 const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);	56 const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);

56 const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);	57 const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);

57 const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);	58 const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);

58 const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);	59 const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);

59 const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);	60 const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);

60 const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);	61 const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);

61 const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);	62 const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);

62 const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);	63 const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);

63 const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);	64 const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);

(...skipping 2637 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2701 _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));	2702 _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));

2702 _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));	2703 _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));

2703 // Process next 8x8	2704 // Process next 8x8

2704 output_currStep += 8;	2705 output_currStep += 8;

2705 output_nextStep += 8;	2706 output_nextStep += 8;

2706 }	2707 }

2707 }	2708 }

2708 }	2709 }

2709 }	2710 }

2710 } // NOLINT	2711 } // NOLINT

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/vp9_variance.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c » ('j') | no next file with comments »