Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stddef.h>
12 #include <arm_neon.h>
13
/* Vertical prediction, 4x4: replicate the 4 pixels directly above the block
 * into every row.
 *   dst      - destination block (4x4), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - pointer to the 4 reconstructed pixels above the block
 *   left     - unused in vertical mode
 */
void vp9_v_predictor_4x4_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int i;
  /* vld1_lane requires an initialized vector operand, hence the vdup. */
  uint32x2_t d0u32 = vdup_n_u32(0);
  (void)left;

  /* Load all 4 above pixels as a single 32-bit lane. */
  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
  for (i = 0; i < 4; i++, dst += y_stride)
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}
28
/* Vertical prediction, 8x8: replicate the 8 pixels directly above the block
 * into every row.
 *   dst      - destination block (8x8), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - pointer to the 8 reconstructed pixels above the block
 *   left     - unused in vertical mode
 */
void vp9_v_predictor_8x8_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int i;
  const uint8x8_t d0u8 = vld1_u8(above);  /* one full row of above pixels */
  (void)left;

  for (i = 0; i < 8; i++, dst += y_stride)
    vst1_u8(dst, d0u8);
}
43
/* Vertical prediction, 16x16: replicate the 16 pixels directly above the
 * block into every row.
 *   dst      - destination block (16x16), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - pointer to the 16 reconstructed pixels above the block
 *   left     - unused in vertical mode
 */
void vp9_v_predictor_16x16_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int i;
  const uint8x16_t q0u8 = vld1q_u8(above);  /* one full row of above pixels */
  (void)left;

  for (i = 0; i < 16; i++, dst += y_stride)
    vst1q_u8(dst, q0u8);
}
58
/* Vertical prediction, 32x32: replicate the 32 pixels directly above the
 * block into every row (two 16-byte vector stores per row).
 *   dst      - destination block (32x32), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - pointer to the 32 reconstructed pixels above the block
 *   left     - unused in vertical mode
 */
void vp9_v_predictor_32x32_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int i;
  const uint8x16_t q0u8 = vld1q_u8(above);       /* above pixels 0..15 */
  const uint8x16_t q1u8 = vld1q_u8(above + 16);  /* above pixels 16..31 */
  (void)left;

  for (i = 0; i < 32; i++, dst += y_stride) {
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
}
77
/* Horizontal prediction, 4x4: row i is left[i] replicated across the row.
 *   dst      - destination block (4x4), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - unused in horizontal mode
 *   left     - pointer to the 4 reconstructed pixels left of the block
 */
void vp9_h_predictor_4x4_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  uint8x8_t d0u8;
  /* vld1_lane requires an initialized vector operand, hence the vdup. */
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  /* Load all 4 left pixels as a single 32-bit lane. */
  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  /* vdup_lane_u8 requires a compile-time lane index, hence the unrolling. */
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += y_stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += y_stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += y_stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
102
/* Horizontal prediction, 8x8: row i is left[i] broadcast across 8 pixels.
 *   dst      - destination block (8x8), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - unused in horizontal mode
 *   left     - pointer to the 8 reconstructed pixels left of the block
 */
void vp9_h_predictor_8x8_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  /* All 8 left-column pixels, one per byte lane. */
  const uint8x8_t d1u8 = vreinterpret_u8_u64(vld1_u64((const uint64_t *)left));
  (void)above;

/* vdup_lane_u8 requires a compile-time lane index, so each row is expanded
 * through a macro instead of a loop over lanes. */
#define VP9_H8_ROW(lane)                     \
  do {                                       \
    vst1_u8(dst, vdup_lane_u8(d1u8, (lane))); \
    dst += y_stride;                         \
  } while (0)

  VP9_H8_ROW(0);
  VP9_H8_ROW(1);
  VP9_H8_ROW(2);
  VP9_H8_ROW(3);
  VP9_H8_ROW(4);
  VP9_H8_ROW(5);
  VP9_H8_ROW(6);
  VP9_H8_ROW(7);
#undef VP9_H8_ROW
}
139
/* Horizontal prediction, 16x16: row i is left[i] broadcast across 16 pixels.
 *   dst      - destination block (16x16), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - unused in horizontal mode
 *   left     - pointer to the 16 reconstructed pixels left of the block
 */
void vp9_h_predictor_16x16_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int j;
  uint8x8_t d2u8;
  const uint8x16_t q1u8 = vld1q_u8(left);  /* all 16 left-column pixels */
  (void)above;

/* vdupq_lane_u8 requires a compile-time lane index, so each row is expanded
 * through a macro instead of a loop over lanes. */
#define VP9_H16_ROW(lane)                       \
  do {                                          \
    vst1q_u8(dst, vdupq_lane_u8(d2u8, (lane))); \
    dst += y_stride;                            \
  } while (0)

  /* First pass uses left[0..7], second pass left[8..15]. */
  d2u8 = vget_low_u8(q1u8);
  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
    VP9_H16_ROW(0);
    VP9_H16_ROW(1);
    VP9_H16_ROW(2);
    VP9_H16_ROW(3);
    VP9_H16_ROW(4);
    VP9_H16_ROW(5);
    VP9_H16_ROW(6);
    VP9_H16_ROW(7);
  }
#undef VP9_H16_ROW
}
181
/* Horizontal prediction, 32x32: row i is left[i] broadcast across 32 pixels
 * (two 16-byte vector stores per row).
 *   dst      - destination block (32x32), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - unused in horizontal mode
 *   left     - pointer to the 32 reconstructed pixels left of the block
 */
void vp9_h_predictor_32x32_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8;
  uint8x16_t q0u8, q1u8;
  (void)above;

/* vdupq_lane_u8 requires a compile-time lane index, so each row is expanded
 * through a macro instead of a loop over lanes. */
#define VP9_H32_ROW(lane)               \
  do {                                  \
    q0u8 = vdupq_lane_u8(d2u8, (lane)); \
    vst1q_u8(dst, q0u8);                \
    vst1q_u8(dst + 16, q0u8);           \
    dst += y_stride;                    \
  } while (0)

  /* Outer loop: left[0..15] then left[16..31]; inner loop: low then high
   * half of the 16 loaded pixels, 8 rows per half. */
  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      VP9_H32_ROW(0);
      VP9_H32_ROW(1);
      VP9_H32_ROW(2);
      VP9_H32_ROW(3);
      VP9_H32_ROW(4);
      VP9_H32_ROW(5);
      VP9_H32_ROW(6);
      VP9_H32_ROW(7);
    }
  }
#undef VP9_H32_ROW
}
233
/* TrueMotion prediction, 4x4: pred(r, c) = clip(left[r] + above[c] - corner),
 * where corner = above[-1] and clip saturates to [0, 255].
 *   dst      - destination block (4x4), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - above pixels; above[-1] is the reconstructed top-left corner
 *   left     - pointer to the 4 reconstructed pixels left of the block
 */
void vp9_tm_predictor_4x4_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8;
  /* vld1_lane requires an initialized vector operand, hence the vdup. */
  uint32x2_t d2u32 = vdup_n_u32(0);

  d0u8 = vdup_n_u8(above[-1]);  /* broadcast the top-left corner pixel */
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  /* above[c] - corner as widened u16 diffs; a negative difference wraps
   * mod 2^16 and is recovered by the signed reinterpret below. */
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += y_stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);  /* saturating narrow to [0, 255] */
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
}
257
/* TrueMotion prediction, 8x8: pred(r, c) = clip(left[r] + above[c] - corner),
 * where corner = above[-1] and clip saturates to [0, 255].
 *   dst      - destination block (8x8), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - above pixels; above[-1] is the reconstructed top-left corner
 *   left     - pointer to the 8 reconstructed pixels left of the block
 */
void vp9_tm_predictor_8x8_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int half;
  uint16x8_t row_u16, diff_u16, left_u16;
  int16x8_t sum_s16;
  uint16x4_t left4_u16;
  uint8x8_t corner_u8, above_u8, left_u8, out_u8;

  corner_u8 = vdup_n_u8(above[-1]);
  left_u8 = vld1_u8(left);
  above_u8 = vld1_u8(above);
  left_u16 = vmovl_u8(left_u8);                /* left[0..7] widened to u16 */
  /* above[c] - corner as widened diffs; negatives wrap mod 2^16 and are
   * recovered by the signed reinterpret inside the row macro. */
  diff_u16 = vsubl_u8(above_u8, corner_u8);
  left4_u16 = vget_low_u16(left_u16);

/* vdupq_lane_u16 requires a compile-time lane index, so each row is expanded
 * through a macro instead of a loop over lanes. */
#define VP9_TM8_ROW(lane)                                  \
  do {                                                     \
    row_u16 = vdupq_lane_u16(left4_u16, (lane));           \
    sum_s16 = vaddq_s16(vreinterpretq_s16_u16(diff_u16),   \
                        vreinterpretq_s16_u16(row_u16));   \
    out_u8 = vqmovun_s16(sum_s16);                         \
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(out_u8)); \
    dst += y_stride;                                       \
  } while (0)

  /* First pass uses left[0..3], second pass left[4..7]. */
  for (half = 0; half < 2; half++, left4_u16 = vget_high_u16(left_u16)) {
    VP9_TM8_ROW(0);
    VP9_TM8_ROW(1);
    VP9_TM8_ROW(2);
    VP9_TM8_ROW(3);
  }
#undef VP9_TM8_ROW
}
303
/* TrueMotion prediction, 16x16: pred(r, c) = clip(left[r] + above[c] - corner),
 * where corner = above[-1] and clip saturates to [0, 255].
 * Processes two rows per unrolled section, keeping the widened
 * (above[c] - corner) differences in q2u16/q3u16 for the whole block.
 *   dst      - destination block (16x16), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - above pixels; above[-1] is the reconstructed top-left corner
 *   left     - pointer to the 16 reconstructed pixels left of the block
 */
void vp9_tm_predictor_16x16_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
  uint8x16_t q0u8, q1u8;
  int16x8_t q0s16, q1s16, q8s16, q11s16;
  uint16x4_t d20u16;
  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;

  q0u8 = vdupq_n_u8(above[-1]);  /* broadcast the top-left corner pixel */
  q1u8 = vld1q_u8(above);        /* all 16 above pixels */
  /* above[c] - corner as widened u16 diffs (low/high halves); negatives
   * wrap mod 2^16 and are recovered by the signed reinterprets below. */
  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  /* Outer loop: left[0..7] then left[8..15]; inner loop: low then high
   * half of the widened left pixels, 4 rows per iteration. */
  for (k = 0; k < 2; k++, left += 8) {
    d18u8 = vld1_u8(left);
    q10u16 = vmovl_u8(d18u8);       /* widen 8 left pixels to u16 */
    d20u16 = vget_low_u16(q10u16);
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      /* Rows 0 and 1 of this group: broadcast left[row], add the diffs,
       * saturate-narrow, and store 8 + 8 bytes per row. */
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);   /* saturating narrow to [0, 255] */
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += y_stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += y_stride;

      /* Rows 2 and 3 of this group: same pipeline with lanes 2 and 3.
       * (vdupq_lane_u16 needs constant lanes, hence the unrolling.) */
      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += y_stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += y_stride;
    }
  }
  return;
}
370
/* TrueMotion prediction, 32x32: pred(r, c) = clip(left[r] + above[c] - corner),
 * where corner = above[-1] and clip saturates to [0, 255].
 * The widened (above[c] - corner) differences for all 32 columns are held in
 * q8u16..q11u16 for the whole block; each row broadcasts one left pixel,
 * adds the four diff vectors, and stores 32 bytes.
 *   dst      - destination block (32x32), written row by row
 *   y_stride - byte stride between destination rows
 *   above    - above pixels; above[-1] is the reconstructed top-left corner
 *   left     - pointer to the 32 reconstructed pixels left of the block
 */
void vp9_tm_predictor_32x32_neon(
    uint8_t *dst,
    ptrdiff_t y_stride,
    const uint8_t *above,
    const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;

  q0u8 = vdupq_n_u8(above[-1]);   /* broadcast the top-left corner pixel */
  q1u8 = vld1q_u8(above);         /* above pixels 0..15 */
  q2u8 = vld1q_u8(above + 16);    /* above pixels 16..31 */
  /* above[c] - corner as widened u16 diffs, 8 columns per register;
   * negatives wrap mod 2^16 and are recovered by the signed reinterprets. */
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
  /* Outer loop: 4 groups of 8 left pixels; inner loop: low then high half
   * of the widened left pixels, 4 rows per iteration (lanes 0..3). */
  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);       /* widen 8 left pixels to u16 */
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      /* Row for lane 0: broadcast left pixel, add all four diff vectors,
       * saturate-narrow to bytes, and store the 32-byte row. */
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);  /* saturating narrow to [0, 255] */
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);   /* columns 0..15 */
      q1u8 = vcombine_u8(d2u8, d3u8);   /* columns 16..31 */
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += y_stride;

      /* Row for lane 1 (vdupq_lane_u16 needs constant lanes, hence the
       * per-lane unrolling of the same pipeline). */
      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += y_stride;

      /* Row for lane 2. */
      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += y_stride;

      /* Row for lane 3. */
      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += y_stride;
    }
  }
  return;
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698