Chromium Code Reviews
Unified Diff: source/libvpx/vpx_dsp/arm/sad_neon.c

Issue 1124333011: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: only update to last night's LKGR (created 5 years, 7 months ago)
 /*
  * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
 
 #include <arm_neon.h>
-#include "./vp9_rtcd.h"
+
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
 
+unsigned int vpx_sad8x16_neon(
+    unsigned char *src_ptr,
+    int src_stride,
+    unsigned char *ref_ptr,
+    int ref_stride) {
+  uint8x8_t d0, d8;
+  uint16x8_t q12;
+  uint32x4_t q1;
+  uint64x2_t q3;
+  uint32x2_t d5;
+  int i;
+
+  d0 = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  d8 = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  q12 = vabdl_u8(d0, d8);
+
+  for (i = 0; i < 15; i++) {
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, d0, d8);
+  }
+
+  q1 = vpaddlq_u16(q12);
+  q3 = vpaddlq_u32(q1);
+  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                vreinterpret_u32_u64(vget_high_u64(q3)));
+
+  return vget_lane_u32(d5, 0);
+}
+
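Each of the three new kernels follows the same shape: vabdl_u8 widens the first row's absolute differences into eight 16-bit lanes, vabal_u8 folds in the remaining rows, and a vpaddl chain reduces the lanes to one scalar. For reference, a minimal scalar sketch of the same 8x16 computation (illustrative only, not part of the patch; the name sad8x16_c is hypothetical, though libvpx keeps its own C fallbacks in vpx_dsp/sad.c):

#include <stdlib.h>

/* Scalar reference for an 8-wide, 16-row SAD (hypothetical sketch).
 * The NEON kernel above computes the same value: |src - ref| summed
 * over all 128 pixels of the block. */
static unsigned int sad8x16_c(const unsigned char *src_ptr, int src_stride,
                              const unsigned char *ref_ptr, int ref_stride) {
  unsigned int sad = 0;
  int row, col;
  for (row = 0; row < 16; ++row) {
    for (col = 0; col < 8; ++col)
      sad += abs(src_ptr[col] - ref_ptr[col]);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sad;
}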
+unsigned int vpx_sad4x4_neon(
+    unsigned char *src_ptr,
+    int src_stride,
+    unsigned char *ref_ptr,
+    int ref_stride) {
+  uint8x8_t d0, d8;
+  uint16x8_t q12;
+  uint32x2_t d1;
+  uint64x1_t d3;
+  int i;
+
+  d0 = vld1_u8(src_ptr);
+  src_ptr += src_stride;
+  d8 = vld1_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  q12 = vabdl_u8(d0, d8);
+
+  for (i = 0; i < 3; i++) {
+    d0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d8 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, d0, d8);
+  }
+
+  d1 = vpaddl_u16(vget_low_u16(q12));
+  d3 = vpaddl_u32(d1);
+
+  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
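Note on vpx_sad4x4_neon: it loads a full 8-byte d-register per row even though the block is only 4 pixels wide. The extra four lanes land in the upper half of q12 and are discarded by reducing only vget_low_u16(q12) at the end. This assumes each 4-pixel row is readable 4 bytes past its end, which holds for libvpx's padded frame buffers but is worth keeping in mind for standalone use.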
+unsigned int vpx_sad16x8_neon(
+    unsigned char *src_ptr,
+    int src_stride,
+    unsigned char *ref_ptr,
+    int ref_stride) {
+  uint8x16_t q0, q4;
+  uint16x8_t q12, q13;
+  uint32x4_t q1;
+  uint64x2_t q3;
+  uint32x2_t d5;
+  int i;
+
+  q0 = vld1q_u8(src_ptr);
+  src_ptr += src_stride;
+  q4 = vld1q_u8(ref_ptr);
+  ref_ptr += ref_stride;
+  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+  for (i = 0; i < 7; i++) {
+    q0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    q4 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+  }
+
+  q12 = vaddq_u16(q12, q13);
+  q1 = vpaddlq_u16(q12);
+  q3 = vpaddlq_u32(q1);
+  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+                vreinterpret_u32_u64(vget_high_u64(q3)));
+
+  return vget_lane_u32(d5, 0);
+}
+
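vpx_sad16x8_neon splits each 16-pixel row into low and high 8-lane halves with separate 16-bit accumulators (q12 and q13), since vabal_u8 widens eight lanes at a time; the two partial sums are combined only once, after the loop.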
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                     const uint16x8_t vec_hi) {
   const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                         vget_high_u16(vec_lo));
   const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                         vget_high_u16(vec_hi));
   const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
   const uint64x2_t b = vpaddlq_u32(a);
   const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                 vreinterpret_u32_u64(vget_high_u64(b)));
   return vget_lane_u32(c, 0);
 }
 static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
   const uint32x4_t a = vpaddlq_u16(vec_16x8);
   const uint64x2_t b = vpaddlq_u32(a);
   const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                 vreinterpret_u32_u64(vget_high_u64(b)));
   return vget_lane_u32(c, 0);
 }
 
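Both helpers reduce 16-bit partial sums to a single scalar via pairwise widening adds. A plain-C equivalent of horizontal_add_16x8, as an illustrative sketch (the _c name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar mirror of horizontal_add_16x8: sum eight 16-bit
 * lanes into a 32-bit total. The NEON helper gets there with
 * vpaddlq_u16 -> vpaddlq_u32 -> one final 32-bit add of the two halves. */
static unsigned int horizontal_add_16x8_c(const uint16_t lanes[8]) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 8; ++i) sum += lanes[i];
  return sum;
}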
-unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
   uint16x8_t vec_accum_hi = vdupq_n_u16(0);
   for (i = 0; i < 64; ++i) {
     const uint8x16_t vec_src_00 = vld1q_u8(src);
     const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
     const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
     const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
     const uint8x16_t vec_ref_00 = vld1q_u8(ref);
 (...skipping 15 matching lines...)
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                             vget_high_u8(vec_ref_32));
     vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                             vget_low_u8(vec_ref_48));
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                             vget_high_u8(vec_ref_48));
   }
   return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }
 
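The 64x64 kernel is the only caller of horizontal_long_add_16x8, and the widening there is load-bearing: each 16-bit lane of vec_accum_lo and vec_accum_hi absorbs four absolute differences per row over 64 rows, at most 4 * 64 * 255 = 65280, which just fits a uint16 lane, but adding the two accumulators in 16 bits could reach 130560 and wrap. The helper therefore widens each accumulator with vaddl_u16 before combining. In the 32x32 kernel below, a lane sees at most 2 * 32 * 255 = 16320, so vaddq_u16 on the pair (at most 32640) cannot overflow and the cheaper horizontal_add_16x8 suffices.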
-unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
   uint16x8_t vec_accum_hi = vdupq_n_u16(0);
 
   for (i = 0; i < 32; ++i) {
     const uint8x16_t vec_src_00 = vld1q_u8(src);
     const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
     const uint8x16_t vec_ref_00 = vld1q_u8(ref);
     const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
     src += src_stride;
     ref += ref_stride;
     vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
     vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
   }
   return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
 
-unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum_lo = vdupq_n_u16(0);
   uint16x8_t vec_accum_hi = vdupq_n_u16(0);
 
   for (i = 0; i < 16; ++i) {
     const uint8x16_t vec_src = vld1q_u8(src);
     const uint8x16_t vec_ref = vld1q_u8(ref);
     src += src_stride;
     ref += ref_stride;
     vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                             vget_low_u8(vec_ref));
     vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                             vget_high_u8(vec_ref));
   }
   return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
 
-unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
   int i;
   uint16x8_t vec_accum = vdupq_n_u16(0);
 
   for (i = 0; i < 8; ++i) {
     const uint8x8_t vec_src = vld1_u8(src);
     const uint8x8_t vec_ref = vld1_u8(ref);
     src += src_stride;
     ref += ref_stride;
     vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
   }
   return horizontal_add_16x8(vec_accum);
 }