Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(335)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <immintrin.h> // AVX2 11 #include <immintrin.h> // AVX2
12 #include "vp9/common/vp9_idct.h" // for cospi constants 12 #include "vp9/common/vp9_idct.h" // for cospi constants
13 #include "vpx_ports/mem.h" 13 #include "vpx_ports/mem.h"
14 14
15 #define pair256_set_epi16(a, b) \ 15 #define pair256_set_epi16(a, b) \
16 _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a) 16 _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
17 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
18 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
19 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
17 20
18 #define pair256_set_epi32(a, b) \ 21 #define pair256_set_epi32(a, b) \
19 _mm256_set_epi32(b, a, b, a, b, a, b, a) 22 _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \
20 23 (int)(b), (int)(a), (int)(b), (int)(a))
21
22
23 24
24 #if FDCT32x32_HIGH_PRECISION 25 #if FDCT32x32_HIGH_PRECISION
25 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { 26 static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
26 __m256i buf0, buf1; 27 __m256i buf0, buf1;
27 buf0 = _mm256_mul_epu32(a, b); 28 buf0 = _mm256_mul_epu32(a, b);
28 a = _mm256_srli_epi64(a, 32); 29 a = _mm256_srli_epi64(a, 32);
29 b = _mm256_srli_epi64(b, 32); 30 b = _mm256_srli_epi64(b, 32);
30 buf1 = _mm256_mul_epu32(a, b); 31 buf1 = _mm256_mul_epu32(a, b);
31 return _mm256_add_epi64(buf0, buf1); 32 return _mm256_add_epi64(buf0, buf1);
32 } 33 }
(...skipping 10 matching lines...) Expand all
43 // Calculate pre-multiplied strides 44 // Calculate pre-multiplied strides
44 const int str1 = stride; 45 const int str1 = stride;
45 const int str2 = 2 * stride; 46 const int str2 = 2 * stride;
46 const int str3 = 2 * stride + str1; 47 const int str3 = 2 * stride + str1;
47 // We need an intermediate buffer between passes. 48 // We need an intermediate buffer between passes.
48 DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); 49 DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
49 // Constants 50 // Constants
50 // When we use them, in one case, they are all the same. In all others 51 // When we use them, in one case, they are all the same. In all others
51 // it's a pair of them that we need to repeat four times. This is done 52 // it's a pair of them that we need to repeat four times. This is done
52 // by constructing the 32 bit constant corresponding to that pair. 53 // by constructing the 32 bit constant corresponding to that pair.
53 const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64); 54 const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
54 const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); 55 const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
55 const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); 56 const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
56 const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); 57 const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
57 const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); 58 const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
58 const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); 59 const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
59 const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); 60 const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
60 const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); 61 const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
61 const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); 62 const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
62 const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); 63 const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
63 const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); 64 const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
(...skipping 2637 matching lines...) Expand 10 before | Expand all | Expand 10 after
2701 _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1)); 2702 _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
2702 _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1)); 2703 _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
2703 // Process next 8x8 2704 // Process next 8x8
2704 output_currStep += 8; 2705 output_currStep += 8;
2705 output_nextStep += 8; 2706 output_nextStep += 8;
2706 } 2707 }
2707 } 2708 }
2708 } 2709 }
2709 } 2710 }
2710 } // NOLINT 2711 } // NOLINT
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_variance.c ('k') | source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698