Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(89)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 232133009: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 14 matching lines...) Expand all
25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { 25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
28 }; 28 };
29 29
30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
33 }; 33 };
34 34
35 #if defined(__clang__)
36 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
37 (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
38 # define MM256_BROADCASTSI128_SI256(x) \
39 _mm_broadcastsi128_si256((__m128i const *)&(x))
40 # else // clang > 3.3, and not 5.0 on macosx.
41 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
42 # endif // clang <= 3.3
43 #elif defined(__GNUC__)
44 # if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
45 # define MM256_BROADCASTSI128_SI256(x) \
46 _mm_broadcastsi128_si256((__m128i const *)&(x))
47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
49 # else // gcc > 4.7
50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
51 # endif // gcc <= 4.6
52 #else // !(gcc || clang)
53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
54 #endif // __clang__
55
35 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, 56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
36 unsigned int src_pixels_per_line, 57 unsigned int src_pixels_per_line,
37 unsigned char *output_ptr, 58 unsigned char *output_ptr,
38 unsigned int output_pitch, 59 unsigned int output_pitch,
39 unsigned int output_height, 60 unsigned int output_height,
40 int16_t *filter) { 61 int16_t *filter) {
41 __m128i filtersReg; 62 __m128i filtersReg;
42 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; 63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
43 __m256i firstFilters, secondFilters, thirdFilters, forthFilters; 64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
44 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; 65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
45 __m256i srcReg32b1, srcReg32b2, filtersReg32; 66 __m256i srcReg32b1, srcReg32b2, filtersReg32;
46 unsigned int i; 67 unsigned int i;
47 unsigned int src_stride, dst_stride; 68 unsigned int src_stride, dst_stride;
48 69
49 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
50 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); 71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
51 filtersReg = _mm_loadu_si128((__m128i *)filter); 72 filtersReg = _mm_loadu_si128((__m128i *)filter);
52 // converting the 16 bit (short) to 8 bit (byte) and have the same data 73 // converting the 16 bit (short) to 8 bit (byte) and have the same data
53 // in both lanes of 128 bit register. 74 // in both lanes of 128 bit register.
54 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 75 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
55 // have the same data in both lanes of a 256 bit register 76 // have the same data in both lanes of a 256 bit register
56 #if defined (__GNUC__) 77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
57 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
58 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
59 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
60 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
61 filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
62 #else
63 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
64 #endif
65 #else
66 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
67 #endif
68 78
69 // duplicate only the first 16 bits (first and second byte) 79 // duplicate only the first 16 bits (first and second byte)
70 // across 256 bit register 80 // across 256 bit register
71 firstFilters = _mm256_shuffle_epi8(filtersReg32, 81 firstFilters = _mm256_shuffle_epi8(filtersReg32,
72 _mm256_set1_epi16(0x100u)); 82 _mm256_set1_epi16(0x100u));
73 // duplicate only the second 16 bits (third and fourth byte) 83 // duplicate only the second 16 bits (third and fourth byte)
74 // across 256 bit register 84 // across 256 bit register
75 secondFilters = _mm256_shuffle_epi8(filtersReg32, 85 secondFilters = _mm256_shuffle_epi8(filtersReg32,
76 _mm256_set1_epi16(0x302u)); 86 _mm256_set1_epi16(0x302u));
77 // duplicate only the third 16 bits (fifth and sixth byte) 87 // duplicate only the third 16 bits (fifth and sixth byte)
(...skipping 224 matching lines...) Expand 10 before | Expand all | Expand 10 after
302 unsigned int i; 312 unsigned int i;
303 unsigned int src_stride, dst_stride; 313 unsigned int src_stride, dst_stride;
304 314
305 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
306 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); 316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
307 filtersReg = _mm_loadu_si128((__m128i *)filter); 317 filtersReg = _mm_loadu_si128((__m128i *)filter);
308 // converting the 16 bit (short) to 8 bit (byte) and have the 318 // converting the 16 bit (short) to 8 bit (byte) and have the
309 // same data in both lanes of 128 bit register. 319 // same data in both lanes of 128 bit register.
310 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
311 // have the same data in both lanes of a 256 bit register 321 // have the same data in both lanes of a 256 bit register
312 #if defined (__GNUC__) 322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
313 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
314 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
315 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
316 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
317 filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
318 #else
319 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
320 #endif
321 #else
322 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
323 #endif
324 323
325 // duplicate only the first 16 bits (first and second byte) 324 // duplicate only the first 16 bits (first and second byte)
326 // across 256 bit register 325 // across 256 bit register
327 firstFilters = _mm256_shuffle_epi8(filtersReg32, 326 firstFilters = _mm256_shuffle_epi8(filtersReg32,
328 _mm256_set1_epi16(0x100u)); 327 _mm256_set1_epi16(0x100u));
329 // duplicate only the second 16 bits (third and fourth byte) 328 // duplicate only the second 16 bits (third and fourth byte)
330 // across 256 bit register 329 // across 256 bit register
331 secondFilters = _mm256_shuffle_epi8(filtersReg32, 330 secondFilters = _mm256_shuffle_epi8(filtersReg32,
332 _mm256_set1_epi16(0x302u)); 331 _mm256_set1_epi16(0x302u));
333 // duplicate only the third 16 bits (fifth and sixth byte) 332 // duplicate only the third 16 bits (fifth and sixth byte)
(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after
536 535
537 // shrink each 16 bits to 8 bits; the first lane contains the first 536 // shrink each 16 bits to 8 bits; the first lane contains the first
538 // convolve result and the second lane contains the second convolve 537 // convolve result and the second lane contains the second convolve
539 // result 538 // result
540 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
541 540
542 // save 16 bytes 541 // save 16 bytes
543 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
544 } 543 }
545 } 544 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_copy_sse2.asm ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698