/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

/* Sum of absolute differences over an 8x8 block: the first row is widened
 * into a 16-bit accumulator with vabdl, the remaining seven rows are added
 * with vabal, and the eight 16-bit partial sums are then reduced to a
 * single 32-bit total. */
unsigned int vp8_sad8x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);  /* first row: |src - ref|, widened to u16 */

    for (i = 0; i < 7; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);  /* accumulate |src - ref| per lane */
    }

    q1 = vpaddlq_u16(q12);   /* pairwise widen: 8 x u16 -> 4 x u32 */
    q3 = vpaddlq_u32(q1);    /* pairwise widen: 4 x u32 -> 2 x u64 */
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}
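
/*
 * A minimal sketch, not part of the original file: the reduction sequence
 * at the end of vp8_sad8x8_neon above is the same pairwise-widening
 * pattern every routine below uses to collapse its 16-bit accumulator,
 * and could be written as a helper.  The name horizontal_sum_u16x8 is
 * invented here for illustration only.
 */
static unsigned int horizontal_sum_u16x8(uint16x8_t v) {
    uint32x4_t a = vpaddlq_u16(v);  /* 8 x u16 -> 4 x u32 pairwise sums */
    uint64x2_t b = vpaddlq_u32(a);  /* 4 x u32 -> 2 x u64 pairwise sums */
    uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                            vreinterpret_u32_u64(vget_high_u64(b)));
    return vget_lane_u32(c, 0);     /* low lane holds the full total */
}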

/* Sum of absolute differences over an 8x16 block: same scheme as the 8x8
 * version, but fifteen additional rows are accumulated. */
unsigned int vp8_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

/* Sum of absolute differences over a 4x4 block.  Each vld1_u8 still loads
 * eight bytes per row, but only the low four 16-bit lanes of the
 * accumulator (the four pixels of interest) are summed at the end. */
unsigned int vp8_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    d1 = vpaddl_u16(vget_low_u16(q12));  /* sum only the first four lanes */
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

/* Sum of absolute differences over a 16x16 block: each 16-byte row is
 * split into two 8-byte halves with separate 16-bit accumulators, which
 * are combined before the final reduction. */
unsigned int vp8_sad16x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 15; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);  /* merge the two half-row accumulators */
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

/* Sum of absolute differences over a 16x8 block: as above, but with eight
 * rows instead of sixteen. */
unsigned int vp8_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}
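
/*
 * For reference only, a plain-C sketch of what each NEON routine above
 * computes, parameterized by block width and height.  This helper is not
 * part of libvpx and the name sad_c_ref is invented here; it is meant
 * only to make the intent of the intrinsics explicit.  For example,
 * sad_c_ref(src, src_stride, ref, ref_stride, 16, 16) should match
 * vp8_sad16x16_neon(src, src_stride, ref, ref_stride).
 */
static unsigned int sad_c_ref(const unsigned char *src_ptr, int src_stride,
                              const unsigned char *ref_ptr, int ref_stride,
                              int width, int height) {
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
            sad += src_ptr[c] > ref_ptr[c] ? src_ptr[c] - ref_ptr[c]
                                           : ref_ptr[c] - src_ptr[c];
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}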