/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
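
/* Reduce two uint16x8_t partial-SAD accumulators to a single scalar, widening
 * the lanes to 32 and then 64 bits along the way so the combined total cannot
 * overflow 16 bits. */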
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
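
/* Reduce a single uint16x8_t accumulator to a scalar via pairwise widening
 * adds. */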
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
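
/* SAD of a 64x64 block: each row is loaded as four 16-byte chunks and the
 * absolute differences are accumulated into two uint16x8_t vectors with
 * vabal_u8. Each 16-bit lane sees at most 64 * 4 * 255 = 65280, so the lanes
 * cannot overflow, but the final total can exceed 16 bits, hence the widening
 * horizontal reduction. */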
unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}
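
/* SAD of a 32x32 block: two 16-byte chunks per row. Each lane accumulates at
 * most 32 * 2 * 255 = 16320, so the two accumulators can be added together in
 * 16 bits before the horizontal reduction. */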
unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
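
/* SAD of a 16x16 block: one 16-byte load per row for each of src and ref,
 * split into low and high halves for the widening absolute-difference
 * accumulate. */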
unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
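
/* SAD of an 8x8 block: a single 8-byte load per row for each buffer,
 * accumulated into one uint16x8_t vector. */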
unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}