source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c - Issue 232133009: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 232133009: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 14 matching lines...) Expand all Loading...
25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {	25 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {

26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,	26 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,

27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12	27 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12

28 };	28 };

29	29

30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {	30 DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {

31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,	31 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,

32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14	32 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14

33 };	33 };

34	34

	35 #if defined(__clang__)

	36 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \

	37 (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)

	38 # define MM256_BROADCASTSI128_SI256(x) \

	39 _mm_broadcastsi128_si256((__m128i const *)&(x))

	40 # else // clang > 3.3, and not 5.0 on macosx.

	41 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)

	42 # endif // clang <= 3.3

	43 #elif defined(__GNUC__)

	44 # if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)

	45 # define MM256_BROADCASTSI128_SI256(x) \

	46 _mm_broadcastsi128_si256((__m128i const *)&(x))

	47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7

	48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)

	49 # else // gcc > 4.7

	50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)

	51 # endif // gcc <= 4.6

	52 #else // !(gcc || clang)

	53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)

	54 #endif // __clang__

	55

35 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,	56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,

36 unsigned int src_pixels_per_line,	57 unsigned int src_pixels_per_line,

37 unsigned char *output_ptr,	58 unsigned char *output_ptr,

38 unsigned int output_pitch,	59 unsigned int output_pitch,

39 unsigned int output_height,	60 unsigned int output_height,

40 int16_t *filter) {	61 int16_t *filter) {

41 __m128i filtersReg;	62 __m128i filtersReg;

42 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;	63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;

43 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;	64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;

44 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;	65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;

45 __m256i srcReg32b1, srcReg32b2, filtersReg32;	66 __m256i srcReg32b1, srcReg32b2, filtersReg32;

46 unsigned int i;	67 unsigned int i;

47 unsigned int src_stride, dst_stride;	68 unsigned int src_stride, dst_stride;

48	69

49 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64	70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

50 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);	71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);

51 filtersReg = _mm_loadu_si128((__m128i *)filter);	72 filtersReg = _mm_loadu_si128((__m128i *)filter);

52 // converting the 16 bit (short) to 8 bit (byte) and have the same data	73 // converting the 16 bit (short) to 8 bit (byte) and have the same data

53 // in both lanes of 128 bit register.	74 // in both lanes of 128 bit register.

54 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);	75 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

55 // have the same data in both lanes of a 256 bit register	76 // have the same data in both lanes of a 256 bit register

56 #if defined (__GNUC__)	77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

57 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \

58 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))

59 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);

60 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))

61 filtersReg32 = _mm_broadcastsi128_si256(filtersReg);

62 #else

63 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);

64 #endif

65 #else

66 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);

67 #endif

68	78

69 // duplicate only the first 16 bits (first and second byte)	79 // duplicate only the first 16 bits (first and second byte)

70 // across 256 bit register	80 // across 256 bit register

71 firstFilters = _mm256_shuffle_epi8(filtersReg32,	81 firstFilters = _mm256_shuffle_epi8(filtersReg32,

72 _mm256_set1_epi16(0x100u));	82 _mm256_set1_epi16(0x100u));

73 // duplicate only the second 16 bits (third and forth byte)	83 // duplicate only the second 16 bits (third and forth byte)

74 // across 256 bit register	84 // across 256 bit register

75 secondFilters = _mm256_shuffle_epi8(filtersReg32,	85 secondFilters = _mm256_shuffle_epi8(filtersReg32,

76 _mm256_set1_epi16(0x302u));	86 _mm256_set1_epi16(0x302u));

77 // duplicate only the third 16 bits (fifth and sixth byte)	87 // duplicate only the third 16 bits (fifth and sixth byte)

(...skipping 224 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
302 unsigned int i;	312 unsigned int i;

303 unsigned int src_stride, dst_stride;	313 unsigned int src_stride, dst_stride;

304	314

305 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64	315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

306 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);	316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);

307 filtersReg = _mm_loadu_si128((__m128i *)filter);	317 filtersReg = _mm_loadu_si128((__m128i *)filter);

308 // converting the 16 bit (short) to 8 bit (byte) and have the	318 // converting the 16 bit (short) to 8 bit (byte) and have the

309 // same data in both lanes of 128 bit register.	319 // same data in both lanes of 128 bit register.

310 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);	320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

311 // have the same data in both lanes of a 256 bit register	321 // have the same data in both lanes of a 256 bit register

312 #if defined (__GNUC__)	322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

313 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \

314 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))

315 filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);

316 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))

317 filtersReg32 = _mm_broadcastsi128_si256(filtersReg);

318 #else

319 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);

320 #endif

321 #else

322 filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);

323 #endif

324	323

325 // duplicate only the first 16 bits (first and second byte)	324 // duplicate only the first 16 bits (first and second byte)

326 // across 256 bit register	325 // across 256 bit register

327 firstFilters = _mm256_shuffle_epi8(filtersReg32,	326 firstFilters = _mm256_shuffle_epi8(filtersReg32,

328 _mm256_set1_epi16(0x100u));	327 _mm256_set1_epi16(0x100u));

329 // duplicate only the second 16 bits (third and forth byte)	328 // duplicate only the second 16 bits (third and forth byte)

330 // across 256 bit register	329 // across 256 bit register

331 secondFilters = _mm256_shuffle_epi8(filtersReg32,	330 secondFilters = _mm256_shuffle_epi8(filtersReg32,

332 _mm256_set1_epi16(0x302u));	331 _mm256_set1_epi16(0x302u));

333 // duplicate only the third 16 bits (fifth and sixth byte)	332 // duplicate only the third 16 bits (fifth and sixth byte)

(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
536	535

537 // shrink to 8 bit each 16 bits, the first lane contain the first	536 // shrink to 8 bit each 16 bits, the first lane contain the first

538 // convolve result and the second lane contain the second convolve	537 // convolve result and the second lane contain the second convolve

539 // result	538 // result

540 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);	539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

541	540

542 // save 16 bytes	541 // save 16 bytes

543 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);	542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

544 }	543 }

545 }	544 }

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_copy_sse2.asm ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.h » ('j') | no next file with comments »