| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 #include "./vp9_rtcd.h" | 12 |
| 13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
| 14 | 14 |
| 15 #include "vpx/vpx_integer.h" | 15 #include "vpx/vpx_integer.h" |
| 16 | 16 |
/*
 * Sum of absolute differences for an 8x16 block.
 *
 * src_ptr/ref_ptr: top-left of the source and reference blocks.
 * src_stride/ref_stride: byte pitch between rows.
 * Returns sum(|src - ref|) over all 8x16 pixels.
 */
unsigned int vpx_sad8x16_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  int row;
  /* Eight 16-bit per-column accumulators; 16 rows * 255 max diff
   * stays well below 65535, so no lane can overflow. */
  uint16x8_t sad_u16 = vdupq_n_u16(0);

  for (row = 0; row < 16; ++row) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_u16 = vabal_u8(sad_u16, s, r);
  }

  /* Widen pairwise and fold the two 64-bit halves to one scalar. */
  {
    const uint32x4_t sad_u32 = vpaddlq_u16(sad_u16);
    const uint64x2_t sad_u64 = vpaddlq_u32(sad_u32);
    const uint32x2_t total =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sad_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sad_u64)));
    return vget_lane_u32(total, 0);
  }
}
| 50 |
/*
 * Sum of absolute differences for a 4x4 block.
 *
 * NOTE: each vld1_u8 reads 8 bytes per row even though only the first
 * 4 belong to the block (callers must tolerate the 4-byte over-read,
 * as the original implementation already required).  The surplus
 * differences land in lanes 4-7 of the accumulator, which the
 * reduction below deliberately ignores.
 */
unsigned int vpx_sad4x4_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  int row;
  uint16x8_t sad_u16 = vdupq_n_u16(0);

  for (row = 0; row < 4; ++row) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_u16 = vabal_u8(sad_u16, s, r);
  }

  /* Sum only lanes 0-3: the four real columns of the block. */
  {
    const uint32x2_t pair_u32 = vpaddl_u16(vget_low_u16(sad_u16));
    const uint64x1_t sum_u64 = vpaddl_u32(pair_u32);
    return vget_lane_u32(vreinterpret_u32_u64(sum_u64), 0);
  }
}
| 81 |
/*
 * Sum of absolute differences for a 16x8 block.
 *
 * Keeps two uint16x8 accumulators (low/high 8 columns) and merges them
 * before the final horizontal reduction.
 */
unsigned int vpx_sad16x8_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  int row;
  uint16x8_t sad_lo = vdupq_n_u16(0);
  uint16x8_t sad_hi = vdupq_n_u16(0);

  for (row = 0; row < 8; ++row) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_lo = vabal_u8(sad_lo, vget_low_u8(s), vget_low_u8(r));
    sad_hi = vabal_u8(sad_hi, vget_high_u8(s), vget_high_u8(r));
  }

  /* Merge the halves, then widen pairwise and fold to one scalar.
   * 8 rows * 255 max diff per lane cannot overflow 16 bits. */
  {
    const uint16x8_t sad_u16 = vaddq_u16(sad_lo, sad_hi);
    const uint32x4_t sad_u32 = vpaddlq_u16(sad_u16);
    const uint64x2_t sad_u64 = vpaddlq_u32(sad_u32);
    const uint32x2_t total =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sad_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sad_u64)));
    return vget_lane_u32(total, 0);
  }
}
| 118 |
/* Reduce two uint16x8 accumulators to one 32-bit scalar sum, widening
 * to 32 bits first so the merge itself cannot overflow 16 bits. */
static inline unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t wide_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t wide_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t sum_u32 = vaddq_u32(wide_lo, wide_hi);
  const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
  const uint32x2_t folded =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
               vreinterpret_u32_u64(vget_high_u64(sum_u64)));
  return vget_lane_u32(folded, 0);
}
/* Reduce one uint16x8 accumulator to a single 32-bit scalar sum. */
static inline unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t sum_u32 = vpaddlq_u16(vec_16x8);
  const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
  const uint32x2_t folded =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
               vreinterpret_u32_u64(vget_high_u64(sum_u64)));
  return vget_lane_u32(folded, 0);
}
| 36 | 138 |
| 37 unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, | 139 unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, |
| 38 const uint8_t *ref, int ref_stride) { | 140 const uint8_t *ref, int ref_stride) { |
| 39 int i; | 141 int i; |
| 40 uint16x8_t vec_accum_lo = vdupq_n_u16(0); | 142 uint16x8_t vec_accum_lo = vdupq_n_u16(0); |
| 41 uint16x8_t vec_accum_hi = vdupq_n_u16(0); | 143 uint16x8_t vec_accum_hi = vdupq_n_u16(0); |
| 42 for (i = 0; i < 64; ++i) { | 144 for (i = 0; i < 64; ++i) { |
| 43 const uint8x16_t vec_src_00 = vld1q_u8(src); | 145 const uint8x16_t vec_src_00 = vld1q_u8(src); |
| 44 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); | 146 const uint8x16_t vec_src_16 = vld1q_u8(src + 16); |
| 45 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); | 147 const uint8x16_t vec_src_32 = vld1q_u8(src + 32); |
| 46 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); | 148 const uint8x16_t vec_src_48 = vld1q_u8(src + 48); |
| 47 const uint8x16_t vec_ref_00 = vld1q_u8(ref); | 149 const uint8x16_t vec_ref_00 = vld1q_u8(ref); |
| (...skipping 15 matching lines...) Expand all Loading... |
| 63 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), | 165 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), |
| 64 vget_high_u8(vec_ref_32)); | 166 vget_high_u8(vec_ref_32)); |
| 65 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), | 167 vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), |
| 66 vget_low_u8(vec_ref_48)); | 168 vget_low_u8(vec_ref_48)); |
| 67 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), | 169 vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), |
| 68 vget_high_u8(vec_ref_48)); | 170 vget_high_u8(vec_ref_48)); |
| 69 } | 171 } |
| 70 return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); | 172 return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); |
| 71 } | 173 } |
| 72 | 174 |
/*
 * Sum of absolute differences for a 32x32 block.
 *
 * Processes each row as two 16-byte loads, accumulating into a low/high
 * pair of uint16x8 registers; the final reduction (same math as
 * horizontal_add_16x8) is inlined here.
 */
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int row;
  uint16x8_t acc_lo = vdupq_n_u16(0);
  uint16x8_t acc_hi = vdupq_n_u16(0);

  for (row = 0; row < 32; ++row) {
    const uint8x16_t s0 = vld1q_u8(src);
    const uint8x16_t s1 = vld1q_u8(src + 16);
    const uint8x16_t r0 = vld1q_u8(ref);
    const uint8x16_t r1 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s0), vget_low_u8(r0));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s0), vget_high_u8(r0));
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s1), vget_low_u8(r1));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s1), vget_high_u8(r1));
  }

  /* Merge, widen pairwise, and fold to one scalar.  Worst case per lane
   * after the merge is 32 rows * 4 diffs * 255 = 32640 < 65536. */
  {
    const uint16x8_t acc = vaddq_u16(acc_lo, acc_hi);
    const uint32x4_t sum_u32 = vpaddlq_u16(acc);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
    const uint32x2_t folded =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sum_u64)));
    return vget_lane_u32(folded, 0);
  }
}
| 97 | 199 |
/*
 * Sum of absolute differences for a 16x16 block.
 *
 * One 16-byte load per row, split across a low/high accumulator pair;
 * the final reduction (same math as horizontal_add_16x8) is inlined.
 */
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int row;
  uint16x8_t acc_lo = vdupq_n_u16(0);
  uint16x8_t acc_hi = vdupq_n_u16(0);

  for (row = 0; row < 16; ++row) {
    const uint8x16_t s = vld1q_u8(src);
    const uint8x16_t r = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    acc_lo = vabal_u8(acc_lo, vget_low_u8(s), vget_low_u8(r));
    acc_hi = vabal_u8(acc_hi, vget_high_u8(s), vget_high_u8(r));
  }

  /* Merge, widen pairwise, and fold to one scalar. */
  {
    const uint16x8_t acc = vaddq_u16(acc_lo, acc_hi);
    const uint32x4_t sum_u32 = vpaddlq_u16(acc);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
    const uint32x2_t folded =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sum_u64)));
    return vget_lane_u32(folded, 0);
  }
}
| 116 | 218 |
/*
 * Sum of absolute differences for an 8x8 block.
 *
 * One 8-byte load per row into a single uint16x8 accumulator; the
 * final reduction (same math as horizontal_add_16x8) is inlined.
 */
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int row;
  uint16x8_t acc = vdupq_n_u16(0);

  for (row = 0; row < 8; ++row) {
    const uint8x8_t s = vld1_u8(src);
    const uint8x8_t r = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    acc = vabal_u8(acc, s, r);
  }

  /* Widen pairwise and fold the two 64-bit halves to one scalar. */
  {
    const uint32x4_t sum_u32 = vpaddlq_u16(acc);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);
    const uint32x2_t folded =
        vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum_u64)),
                 vreinterpret_u32_u64(vget_high_u64(sum_u64)));
    return vget_lane_u32(folded, 0);
  }
}
| OLD | NEW |