Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_neon64.cc

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 #ifdef HAS_SCALEROWDOWN2_NEON
21 // Read 32x1 pixels, throw away the even pixels, and write 16x1.
22 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
23 uint8* dst, int dst_width) {
24 asm volatile (
25 ".p2align 2 \n"
26 "1: \n"
27 // load even pixels into q0, odd into q1
28 MEMACCESS(0)
29 "vld2.8 {q0, q1}, [%0]! \n"
30 "subs %2, %2, #16 \n" // 16 processed per loop
31 MEMACCESS(1)
32 "vst1.8 {q1}, [%1]! \n" // store odd pixels
33 "bgt 1b \n"
34 : "+r"(src_ptr), // %0
35 "+r"(dst), // %1
36 "+r"(dst_width) // %2
37 :
38 : "q0", "q1" // Clobber List
39 );
40 }
41 #endif //HAS_SCALEROWDOWN2_NEON
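
A scalar sketch of what this routine computes may help review. The ScaleRowDown2_C_sketch name and plain uint8_t (for libyuv's uint8) are illustrative only, not part of the patch:

#include <stddef.h>
#include <stdint.h>

// Keep every second (odd) pixel. src_stride is unused, as in the NEON version.
static void ScaleRowDown2_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];  // vld2.8 deinterleaves; q1 holds the odd pixels
  }
}

The NEON loop retires 16 pixels per iteration ("subs %2, %2, #16"), so it is only suitable for dst_width values that are a multiple of 16.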
42
43 #ifdef HAS_SCALEROWDOWN2_NEON
44 // Read 32x2 pixels, average down, and write 16x1.
45 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
46 uint8* dst, int dst_width) {
47 asm volatile (
48 // turn src_stride into a pointer to row 2
49 "add %1, %0 \n"
50 ".p2align 2 \n"
51 "1: \n"
52 MEMACCESS(0)
53 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
54 MEMACCESS(1)
55 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
56 "subs %3, %3, #16 \n" // 16 processed per loop
57 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
58 "vpaddl.u8 q1, q1 \n"
59 "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
60 "vpadal.u8 q1, q3 \n"
61 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
62 "vrshrn.u16 d1, q1, #2 \n"
63 MEMACCESS(2)
64 "vst1.8 {q0}, [%2]! \n"
65 "bgt 1b \n"
66 : "+r"(src_ptr), // %0
67 "+r"(src_stride), // %1
68 "+r"(dst), // %2
69 "+r"(dst_width) // %3
70 :
71 : "q0", "q1", "q2", "q3" // Clobber List
72 );
73 }
74 #endif //HAS_SCALEROWDOWN2_NEON
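
The same routine in scalar form, as a review aid (hypothetical helper name, not part of the patch):

#include <stddef.h>
#include <stdint.h>

// 2x2 rounding box average: vpaddl/vpadal build the four-byte sums as shorts
// and vrshrn.u16 #2 rounds and packs them back to bytes.
static void ScaleRowDown2Box_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                      uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;  // row 2, as formed by "add %1, %0"
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}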
75
76 #ifdef HAS_SCALEROWDOWN4_NEON
77 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
78 uint8* dst_ptr, int dst_width) {
79 asm volatile (
80 ".p2align 2 \n"
81 "1: \n"
82 MEMACCESS(0)
83 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
84 "subs %2, %2, #8 \n" // 8 processed per loop
85 MEMACCESS(1)
86 "vst1.8 {d2}, [%1]! \n"
87 "bgt 1b \n"
88 : "+r"(src_ptr), // %0
89 "+r"(dst_ptr), // %1
90 "+r"(dst_width) // %2
91 :
92 : "q0", "q1", "memory", "cc"
93 );
94 }
95 #endif //HAS_SCALEROWDOWN4_NEON
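
Scalar equivalent (illustrative name, not in the patch):

#include <stddef.h>
#include <stdint.h>

// Point-sample 4:1. vld4.8 splits the row into four phase registers and the
// store of d2 keeps the pixel at offset 2 of each group of 4.
static void ScaleRowDown4_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];
  }
}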
96
97 #ifdef HAS_SCALEROWDOWN4_NEON
98 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
99 uint8* dst_ptr, int dst_width) {
100 const uint8* src_ptr1 = src_ptr + src_stride;
101 const uint8* src_ptr2 = src_ptr + src_stride * 2;
102 const uint8* src_ptr3 = src_ptr + src_stride * 3;
103 asm volatile (
104 ".p2align 2 \n"
105 "1: \n"
106 MEMACCESS(0)
107 "vld1.8 {q0}, [%0]! \n" // load up 16x4
108 MEMACCESS(3)
109 "vld1.8 {q1}, [%3]! \n"
110 MEMACCESS(4)
111 "vld1.8 {q2}, [%4]! \n"
112 MEMACCESS(5)
113 "vld1.8 {q3}, [%5]! \n"
114 "subs %2, %2, #4 \n"
115 "vpaddl.u8 q0, q0 \n"
116 "vpadal.u8 q0, q1 \n"
117 "vpadal.u8 q0, q2 \n"
118 "vpadal.u8 q0, q3 \n"
119 "vpaddl.u16 q0, q0 \n"
120 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
121 "vmovn.u16 d0, q0 \n"
122 MEMACCESS(1)
123 "vst1.32 {d0[0]}, [%1]! \n"
124 "bgt 1b \n"
125 : "+r"(src_ptr), // %0
126 "+r"(dst_ptr), // %1
127 "+r"(dst_width), // %2
128 "+r"(src_ptr1), // %3
129 "+r"(src_ptr2), // %4
130 "+r"(src_ptr3) // %5
131 :
132 : "q0", "q1", "q2", "q3", "memory", "cc"
133 );
134 }
135 #endif //HAS_SCALEROWDOWN4_NEON
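
Scalar model of the 4x4 box filter above (a sketch, not the library's C path):

#include <stddef.h>
#include <stdint.h>

// Each output is the rounded mean of a 4x4 block: vpaddl/vpadal accumulate the
// 16-pixel sum and vrshrn.u32 #4 performs the rounded divide by 16.
static void ScaleRowDown4Box_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * x + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
  }
}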
136
137 #ifdef HAS_SCALEROWDOWN34_NEON
138 // Down scale from 4 to 3 pixels. Use the NEON multilane read/write
139 // to load every 4th pixel into a different register.
140 // Point samples 32 pixels to 24 pixels.
141 void ScaleRowDown34_NEON(const uint8* src_ptr,
142 ptrdiff_t src_stride,
143 uint8* dst_ptr, int dst_width) {
144 asm volatile (
145 ".p2align 2 \n"
146 "1: \n"
147 MEMACCESS(0)
148 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
149 "subs %2, %2, #24 \n"
150 "vmov d2, d3 \n" // order d0, d1, d2
151 MEMACCESS(1)
152 "vst3.8 {d0, d1, d2}, [%1]! \n"
153 "bgt 1b \n"
154 : "+r"(src_ptr), // %0
155 "+r"(dst_ptr), // %1
156 "+r"(dst_width) // %2
157 :
158 : "d0", "d1", "d2", "d3", "memory", "cc"
159 );
160 }
161 #endif //HAS_SCALEROWDOWN34_NEON
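
In scalar terms the routine reduces to the following (hypothetical helper):

#include <stddef.h>
#include <stdint.h>

// Point-sample 4 pixels down to 3: keep offsets 0, 1 and 3 of each group.
// "vmov d2, d3" replaces phase 2 with phase 3 ahead of the 3-way store.
static void ScaleRowDown34_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width / 3; ++x) {
    dst_ptr[3 * x + 0] = src_ptr[4 * x + 0];
    dst_ptr[3 * x + 1] = src_ptr[4 * x + 1];
    dst_ptr[3 * x + 2] = src_ptr[4 * x + 3];
  }
}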
162
163 #ifdef HAS_SCALEROWDOWN34_NEON
164 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
165 ptrdiff_t src_stride,
166 uint8* dst_ptr, int dst_width) {
167 asm volatile (
168 "vmov.u8 d24, #3 \n"
169 "add %3, %0 \n"
170 ".p2align 2 \n"
171 "1: \n"
172 MEMACCESS(0)
173 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
174 MEMACCESS(3)
175 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
176 "subs %2, %2, #24 \n"
177
178 // filter src line 0 with src line 1
179 // expand bytes to shorts to make room
180 // when adding the lines together
181 "vmovl.u8 q8, d4 \n"
182 "vmovl.u8 q9, d5 \n"
183 "vmovl.u8 q10, d6 \n"
184 "vmovl.u8 q11, d7 \n"
185
186 // 3 * line_0 + line_1
187 "vmlal.u8 q8, d0, d24 \n"
188 "vmlal.u8 q9, d1, d24 \n"
189 "vmlal.u8 q10, d2, d24 \n"
190 "vmlal.u8 q11, d3, d24 \n"
191
192 // (3 * line_0 + line_1) >> 2
193 "vqrshrn.u16 d0, q8, #2 \n"
194 "vqrshrn.u16 d1, q9, #2 \n"
195 "vqrshrn.u16 d2, q10, #2 \n"
196 "vqrshrn.u16 d3, q11, #2 \n"
197
198 // a0 = (s[0] * 3 + s[1] * 1) >> 2
199 "vmovl.u8 q8, d1 \n"
200 "vmlal.u8 q8, d0, d24 \n"
201 "vqrshrn.u16 d0, q8, #2 \n"
202
203 // a1 = (s[1] * 1 + s[2] * 1) >> 1
204 "vrhadd.u8 d1, d1, d2 \n"
205
206 // a2 = (s[2] * 1 + s[3] * 3) >> 2
207 "vmovl.u8 q8, d2 \n"
208 "vmlal.u8 q8, d3, d24 \n"
209 "vqrshrn.u16 d2, q8, #2 \n"
210
211 MEMACCESS(1)
212 "vst3.8 {d0, d1, d2}, [%1]! \n"
213
214 "bgt 1b \n"
215 : "+r"(src_ptr), // %0
216 "+r"(dst_ptr), // %1
217 "+r"(dst_width), // %2
218 "+r"(src_stride) // %3
219 :
220 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
221 );
222 }
223 #endif //HAS_SCALEROWDOWN34_NEON
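
For review, the filter above in scalar form (sketch; the name is illustrative):

#include <stddef.h>
#include <stdint.h>

// Vertical 3:1 blend of the two rows, then a 4 -> 3 horizontal filter with
// 3:1, 1:1 and 1:3 taps. All shifts are rounded, matching vqrshrn/vrhadd.
static void ScaleRowDown34_0_Box_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width / 3; ++x) {
    uint8_t b[4];
    for (int i = 0; i < 4; ++i) {
      b[i] = (uint8_t)((3 * s0[4 * x + i] + s1[4 * x + i] + 2) >> 2);
    }
    dst_ptr[3 * x + 0] = (uint8_t)((3 * b[0] + b[1] + 2) >> 2);
    dst_ptr[3 * x + 1] = (uint8_t)((b[1] + b[2] + 1) >> 1);
    dst_ptr[3 * x + 2] = (uint8_t)((b[2] + 3 * b[3] + 2) >> 2);
  }
}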
224
225 #ifdef HAS_SCALEROWDOWN34_NEON
226 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
227 ptrdiff_t src_stride,
228 uint8* dst_ptr, int dst_width) {
229 asm volatile (
230 "vmov.u8 d24, #3 \n"
231 "add %3, %0 \n"
232 ".p2align 2 \n"
233 "1: \n"
234 MEMACCESS(0)
235 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
236 MEMACCESS(3)
237 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
238 "subs %2, %2, #24 \n"
239 // average src line 0 with src line 1
240 "vrhadd.u8 q0, q0, q2 \n"
241 "vrhadd.u8 q1, q1, q3 \n"
242
243 // a0 = (s[0] * 3 + s[1] * 1) >> 2
244 "vmovl.u8 q3, d1 \n"
245 "vmlal.u8 q3, d0, d24 \n"
246 "vqrshrn.u16 d0, q3, #2 \n"
247
248 // a1 = (s[1] * 1 + s[2] * 1) >> 1
249 "vrhadd.u8 d1, d1, d2 \n"
250
251 // a2 = (s[2] * 1 + s[3] * 3) >> 2
252 "vmovl.u8 q3, d2 \n"
253 "vmlal.u8 q3, d3, d24 \n"
254 "vqrshrn.u16 d2, q3, #2 \n"
255
256 MEMACCESS(1)
257 "vst3.8 {d0, d1, d2}, [%1]! \n"
258 "bgt 1b \n"
259 : "+r"(src_ptr), // %0
260 "+r"(dst_ptr), // %1
261 "+r"(dst_width), // %2
262 "+r"(src_stride) // %3
263 :
264 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
265 );
266 }
267 #endif //HAS_SCALEROWDOWN34_NEON
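
This variant differs from ScaleRowDown34_0_Box_NEON only in the vertical step, which is a plain 1:1 rounding average (vrhadd.u8). A scalar sketch:

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown34_1_Box_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width / 3; ++x) {
    uint8_t b[4];
    for (int i = 0; i < 4; ++i) {
      b[i] = (uint8_t)((s0[4 * x + i] + s1[4 * x + i] + 1) >> 1);  // vrhadd.u8
    }
    dst_ptr[3 * x + 0] = (uint8_t)((3 * b[0] + b[1] + 2) >> 2);
    dst_ptr[3 * x + 1] = (uint8_t)((b[1] + b[2] + 1) >> 1);
    dst_ptr[3 * x + 2] = (uint8_t)((b[2] + 3 * b[3] + 2) >> 2);
  }
}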
268
269 #ifdef HAS_SCALEROWDOWN38_NEON
271 static uvec8 kShuf38 =
272 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
273 static uvec8 kShuf38_2 =
274 { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
275 static vec16 kMult38_Div6 =
276 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
277 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
278 static vec16 kMult38_Div9 =
279 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
280 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
281
282 // 32 -> 12
283 void ScaleRowDown38_NEON(const uint8* src_ptr,
284 ptrdiff_t src_stride,
285 uint8* dst_ptr, int dst_width) {
286 asm volatile (
287 MEMACCESS(3)
288 "vld1.8 {q3}, [%3] \n"
289 ".p2align 2 \n"
290 "1: \n"
291 MEMACCESS(0)
292 "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
293 "subs %2, %2, #12 \n"
294 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
295 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
296 MEMACCESS(1)
297 "vst1.8 {d4}, [%1]! \n"
298 MEMACCESS(1)
299 "vst1.32 {d5[0]}, [%1]! \n"
300 "bgt 1b \n"
301 : "+r"(src_ptr), // %0
302 "+r"(dst_ptr), // %1
303 "+r"(dst_width) // %2
304 : "r"(&kShuf38) // %3
305 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
306 );
307 }
308
309 #endif //HAS_SCALEROWDOWN38_NEON
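
The table lookup amounts to the following point sampling (sketch; kIdx repeats the first 12 entries of kShuf38):

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown38_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  static const uint8_t kIdx[12] = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30};
  (void)src_stride;
  for (int x = 0; x < dst_width; x += 12) {
    for (int i = 0; i < 12; ++i) {
      dst_ptr[x + i] = src_ptr[kIdx[i]];  // vtbl with kShuf38 as index vector
    }
    src_ptr += 32;
  }
}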
310
311 #ifdef HAS_SCALEROWDOWN38_NEON
312 // 32x3 -> 12x1
313 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
314 ptrdiff_t src_stride,
315 uint8* dst_ptr, int dst_width) {
316 const uint8* src_ptr1 = src_ptr + src_stride * 2;
317
318 asm volatile (
319 MEMACCESS(5)
320 "vld1.16 {q13}, [%5] \n"
321 MEMACCESS(6)
322 "vld1.8 {q14}, [%6] \n"
323 MEMACCESS(7)
324 "vld1.8 {q15}, [%7] \n"
325 "add %3, %0 \n"
326 ".p2align 2 \n"
327 "1: \n"
328
329 // d0 = 00 40 01 41 02 42 03 43
330 // d1 = 10 50 11 51 12 52 13 53
331 // d2 = 20 60 21 61 22 62 23 63
332 // d3 = 30 70 31 71 32 72 33 73
333 MEMACCESS(0)
334 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
335 MEMACCESS(3)
336 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
337 MEMACCESS(4)
338 "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
339 "subs %2, %2, #12 \n"
340
341 // Shuffle the input data around to align it so that
342 // adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7.
343 // d0 = 00 10 01 11 02 12 03 13
344 // d1 = 40 50 41 51 42 52 43 53
345 "vtrn.u8 d0, d1 \n"
346 "vtrn.u8 d4, d5 \n"
347 "vtrn.u8 d16, d17 \n"
348
349 // d2 = 20 30 21 31 22 32 23 33
350 // d3 = 60 70 61 71 62 72 63 73
351 "vtrn.u8 d2, d3 \n"
352 "vtrn.u8 d6, d7 \n"
353 "vtrn.u8 d18, d19 \n"
354
355 // d0 = 00+10 01+11 02+12 03+13
356 // d1 = 40+50 41+51 42+52 43+53
357 "vpaddl.u8 q0, q0 \n"
358 "vpaddl.u8 q2, q2 \n"
359 "vpaddl.u8 q8, q8 \n"
360
361 // d3 = 60+70 61+71 62+72 63+73
362 "vpaddl.u8 d3, d3 \n"
363 "vpaddl.u8 d7, d7 \n"
364 "vpaddl.u8 d19, d19 \n"
365
366 // combine source lines
367 "vadd.u16 q0, q2 \n"
368 "vadd.u16 q0, q8 \n"
369 "vadd.u16 d4, d3, d7 \n"
370 "vadd.u16 d4, d19 \n"
371
372 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
373 // + s[6 + st * 1] + s[7 + st * 1]
374 // + s[6 + st * 2] + s[7 + st * 2]) / 6
375 "vqrdmulh.s16 q2, q2, q13 \n"
376 "vmovn.u16 d4, q2 \n"
377
378 // Shuffle 2,3 reg around so that 2 can be added to the
379 // 0,1 reg and 3 can be added to the 4,5 reg. This
380 // requires expanding from u8 to u16 as the 0,1 and 4,5
381 // registers are already expanded. Then do transposes
382 // to get aligned.
383 // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
384 "vmovl.u8 q1, d2 \n"
385 "vmovl.u8 q3, d6 \n"
386 "vmovl.u8 q9, d18 \n"
387
388 // combine source lines
389 "vadd.u16 q1, q3 \n"
390 "vadd.u16 q1, q9 \n"
391
392 // d2 = xx 20 xx 30 xx 22 xx 32
393 // d3 = xx 21 xx 31 xx 23 xx 33
394 "vtrn.u32 d2, d3 \n"
395
396 // d2 = xx 20 xx 21 xx 22 xx 23
397 // d3 = xx 30 xx 31 xx 32 xx 33
398 "vtrn.u16 d2, d3 \n"
399
400 // 0+1+2, 3+4+5
401 "vadd.u16 q0, q1 \n"
402
403 // Need to divide, but can't downshift as the value
404 // isn't a power of 2. So multiply by 65536 / n
405 // and take the upper 16 bits.
406 "vqrdmulh.s16 q0, q0, q15 \n"
407
408 // Align for the table lookup; vtbl requires its
409 // source registers to be adjacent.
410 "vmov.u8 d2, d4 \n"
411
412 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
413 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
414
415 MEMACCESS(1)
416 "vst1.8 {d3}, [%1]! \n"
417 MEMACCESS(1)
418 "vst1.32 {d4[0]}, [%1]! \n"
419 "bgt 1b \n"
420 : "+r"(src_ptr), // %0
421 "+r"(dst_ptr), // %1
422 "+r"(dst_width), // %2
423 "+r"(src_stride), // %3
424 "+r"(src_ptr1) // %4
425 : "r"(&kMult38_Div6), // %5
426 "r"(&kShuf38_2), // %6
427 "r"(&kMult38_Div9) // %7
428 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
429 );
430 }
431 #endif //HAS_SCALEROWDOWN38_NEON
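
Up to the rounding behaviour of vqrdmulh, the routine computes the averages below; the helper name is illustrative, not part of libyuv:

#include <stddef.h>
#include <stdint.h>

// Every 8x3 block of source pixels yields 3 outputs: two 3x3 averages
// (vqrdmulh by 65536/18, i.e. /9) and one 2x3 average (65536/12, i.e. /6).
static void ScaleRowDown38_3_Box_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    int sum[3] = {0, 0, 0};
    for (int r = 0; r < 3; ++r) {
      const uint8_t* s = src_ptr + r * src_stride;
      sum[0] += s[0] + s[1] + s[2];
      sum[1] += s[3] + s[4] + s[5];
      sum[2] += s[6] + s[7];
    }
    dst_ptr[x + 0] = (uint8_t)(sum[0] / 9);
    dst_ptr[x + 1] = (uint8_t)(sum[1] / 9);
    dst_ptr[x + 2] = (uint8_t)(sum[2] / 6);
    src_ptr += 8;
  }
}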
432
433 #ifdef HAS_SCALEROWDOWN38_NEON
434 // 32x2 -> 12x1
435 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
436 ptrdiff_t src_stride,
437 uint8* dst_ptr, int dst_width) {
438 asm volatile (
439 MEMACCESS(4)
440 "vld1.16 {q13}, [%4] \n"
441 MEMACCESS(5)
442 "vld1.8 {q14}, [%5] \n"
443 "add %3, %0 \n"
444 ".p2align 2 \n"
445 "1: \n"
446
447 // d0 = 00 40 01 41 02 42 03 43
448 // d1 = 10 50 11 51 12 52 13 53
449 // d2 = 20 60 21 61 22 62 23 63
450 // d3 = 30 70 31 71 32 72 33 73
451 MEMACCESS(0)
452 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
453 MEMACCESS(3)
454 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
455 "subs %2, %2, #12 \n"
456
457 // Shuffle the input data around to align it so that
458 // adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7.
459 // d0 = 00 10 01 11 02 12 03 13
460 // d1 = 40 50 41 51 42 52 43 53
461 "vtrn.u8 d0, d1 \n"
462 "vtrn.u8 d4, d5 \n"
463
464 // d2 = 20 30 21 31 22 32 23 33
465 // d3 = 60 70 61 71 62 72 63 73
466 "vtrn.u8 d2, d3 \n"
467 "vtrn.u8 d6, d7 \n"
468
469 // d0 = 00+10 01+11 02+12 03+13
470 // d1 = 40+50 41+51 42+52 43+53
471 "vpaddl.u8 q0, q0 \n"
472 "vpaddl.u8 q2, q2 \n"
473
474 // d3 = 60+70 61+71 62+72 63+73
475 "vpaddl.u8 d3, d3 \n"
476 "vpaddl.u8 d7, d7 \n"
477
478 // combine source lines
479 "vadd.u16 q0, q2 \n"
480 "vadd.u16 d4, d3, d7 \n"
481
482 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
483 "vqrshrn.u16 d4, q2, #2 \n"
484
485 // Shuffle 2,3 reg around so that 2 can be added to the
486 // 0,1 reg and 3 can be added to the 4,5 reg. This
487 // requires expanding from u8 to u16 as the 0,1 and 4,5
488 // registers are already expanded. Then do transposes
489 // to get aligned.
490 // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
491 "vmovl.u8 q1, d2 \n"
492 "vmovl.u8 q3, d6 \n"
493
494 // combine source lines
495 "vadd.u16 q1, q3 \n"
496
497 // d2 = xx 20 xx 30 xx 22 xx 32
498 // d3 = xx 21 xx 31 xx 23 xx 33
499 "vtrn.u32 d2, d3 \n"
500
501 // d2 = xx 20 xx 21 xx 22 xx 23
502 // d3 = xx 30 xx 31 xx 32 xx 33
503 "vtrn.u16 d2, d3 \n"
504
505 // 0+1+2, 3+4+5
506 "vadd.u16 q0, q1 \n"
507
508 // Need to divide, but can't downshift as the value
509 // isn't a power of 2. So multiply by 65536 / n
510 // and take the upper 16 bits.
511 "vqrdmulh.s16 q0, q0, q13 \n"
512
513 // Align for the table lookup; vtbl requires its
514 // source registers to be adjacent.
515 "vmov.u8 d2, d4 \n"
516
517 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
518 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
519
520 MEMACCESS(1)
521 "vst1.8 {d3}, [%1]! \n"
522 MEMACCESS(1)
523 "vst1.32 {d4[0]}, [%1]! \n"
524 "bgt 1b \n"
525 : "+r"(src_ptr), // %0
526 "+r"(dst_ptr), // %1
527 "+r"(dst_width), // %2
528 "+r"(src_stride) // %3
529 : "r"(&kMult38_Div6), // %4
530 "r"(&kShuf38_2) // %5
531 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
532 );
533 }
534 #endif //HAS_SCALEROWDOWN38_NEON
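
The two-row variant in the same scalar terms (sketch, modulo vqrdmulh rounding):

#include <stddef.h>
#include <stdint.h>

// Per 8x2 source block: two 3x2 averages (/6 via 65536/12) and one rounded
// 2x2 average (vqrshrn #2).
static void ScaleRowDown38_2_Box_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) / 6);
    dst_ptr[x + 1] = (uint8_t)((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) / 6);
    dst_ptr[x + 2] = (uint8_t)((s[6] + s[7] + t[6] + t[7] + 2) >> 2);
    s += 8;
    t += 8;
  }
}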
535
536 #if 0
537 // 16x2 -> 16x1
538 void ScaleFilterRows_NEON(uint8* dst_ptr,
539 const uint8* src_ptr, ptrdiff_t src_stride,
540 int dst_width, int source_y_fraction) {
541 asm volatile (
542 "cmp %4, #0 \n"
543 "beq 100f \n"
544 "add %2, %1 \n"
545 "cmp %4, #64 \n"
546 "beq 75f \n"
547 "cmp %4, #128 \n"
548 "beq 50f \n"
549 "cmp %4, #192 \n"
550 "beq 25f \n"
551
552 "vdup.8 d5, %4 \n"
553 "rsb %4, #256 \n"
554 "vdup.8 d4, %4 \n"
555 // General purpose row blend.
556 "1: \n"
557 MEMACCESS(1)
558 "vld1.8 {q0}, [%1]! \n"
559 MEMACCESS(2)
560 "vld1.8 {q1}, [%2]! \n"
561 "subs %3, %3, #16 \n"
562 "vmull.u8 q13, d0, d4 \n"
563 "vmull.u8 q14, d1, d4 \n"
564 "vmlal.u8 q13, d2, d5 \n"
565 "vmlal.u8 q14, d3, d5 \n"
566 "vrshrn.u16 d0, q13, #8 \n"
567 "vrshrn.u16 d1, q14, #8 \n"
568 MEMACCESS(0)
569 "vst1.8 {q0}, [%0]! \n"
570 "bgt 1b \n"
571 "b 99f \n"
572
573 // Blend 25 / 75.
574 "25: \n"
575 MEMACCESS(1)
576 "vld1.8 {q0}, [%1]! \n"
577 MEMACCESS(2)
578 "vld1.8 {q1}, [%2]! \n"
579 "subs %3, %3, #16 \n"
580 "vrhadd.u8 q0, q1 \n"
581 "vrhadd.u8 q0, q1 \n"
582 MEMACCESS(0)
583 "vst1.8 {q0}, [%0]! \n"
584 "bgt 25b \n"
585 "b 99f \n"
586
587 // Blend 50 / 50.
588 "50: \n"
589 MEMACCESS(1)
590 "vld1.8 {q0}, [%1]! \n"
591 MEMACCESS(2)
592 "vld1.8 {q1}, [%2]! \n"
593 "subs %3, %3, #16 \n"
594 "vrhadd.u8 q0, q1 \n"
595 MEMACCESS(0)
596 "vst1.8 {q0}, [%0]! \n"
597 "bgt 50b \n"
598 "b 99f \n"
599
600 // Blend 75 / 25.
601 "75: \n"
602 MEMACCESS(1)
603 "vld1.8 {q1}, [%1]! \n"
604 MEMACCESS(2)
605 "vld1.8 {q0}, [%2]! \n"
606 "subs %3, %3, #16 \n"
607 "vrhadd.u8 q0, q1 \n"
608 "vrhadd.u8 q0, q1 \n"
609 MEMACCESS(0)
610 "vst1.8 {q0}, [%0]! \n"
611 "bgt 75b \n"
612 "b 99f \n"
613
614 // Blend 100 / 0 - Copy row unchanged.
615 "100: \n"
616 MEMACCESS(1)
617 "vld1.8 {q0}, [%1]! \n"
618 "subs %3, %3, #16 \n"
619 MEMACCESS(0)
620 "vst1.8 {q0}, [%0]! \n"
621 "bgt 100b \n"
622
623 "99: \n"
624 MEMACCESS(0)
625 "vst1.8 {d1[7]}, [%0] \n"
626 : "+r"(dst_ptr), // %0
627 "+r"(src_ptr), // %1
628 "+r"(src_stride), // %2
629 "+r"(dst_width), // %3
630 "+r"(source_y_fraction) // %4
631 :
632 : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
633 );
634 }
635 #endif //0
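
The disabled routine blends two rows with an 8.8 fixed-point fraction; the labelled branches (100/75/50/25) are fast paths for source_y_fraction values 0, 64, 128 and 192. A scalar model of the general path (sketch only):

#include <stddef.h>
#include <stdint.h>

static void ScaleFilterRows_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  const uint8_t* s1 = src_ptr + src_stride;
  int f1 = source_y_fraction;  // weight of row 1 (vdup.8 d5)
  int f0 = 256 - f1;           // weight of row 0 ("rsb %4, #256")
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * f0 + s1[x] * f1 + 128) >> 8);
  }
  // The NEON version also re-stores the last pixel one position past the end
  // of the row ("vst1.8 {d1[7]}, [%0]").
}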
636
637 #ifdef HAS_SCALEARGBROWDOWN2_NEON
638 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
639 uint8* dst, int dst_width) {
640 asm volatile (
641 ".p2align 2 \n"
642 "1: \n"
643 // load even pixels into q0, odd into q1
644 MEMACCESS(0)
645 "vld2.32 {q0, q1}, [%0]! \n"
646 MEMACCESS(0)
647 "vld2.32 {q2, q3}, [%0]! \n"
648 "subs %2, %2, #8 \n" // 8 processed per loop
649 MEMACCESS(1)
650 "vst1.8 {q1}, [%1]! \n" // store odd pixels
651 MEMACCESS(1)
652 "vst1.8 {q3}, [%1]! \n"
653 "bgt 1b \n"
654 : "+r"(src_ptr), // %0
655 "+r"(dst), // %1
656 "+r"(dst_width) // %2
657 :
658 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
659 );
660 }
661 #endif //HAS_SCALEARGBROWDOWN2_NEON
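
Scalar equivalent, treating each ARGB pixel as one 32-bit unit (sketch):

#include <stddef.h>
#include <stdint.h>

static void ScaleARGBRowDown2_C_sketch(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8_t* dst, int dst_width) {
  // Assumes 4-byte aligned pixels, matching the NEON version's 32-bit loads.
  const uint32_t* src = (const uint32_t*)src_ptr;
  uint32_t* dst32 = (uint32_t*)dst;
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst32[x] = src[2 * x + 1];  // vld2.32 deinterleaves; q1/q3 hold odd pixels
  }
}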
662
663 #ifdef HAS_SCALEARGBROWDOWN2_NEON
664 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
665 uint8* dst, int dst_width) {
666 asm volatile (
667 // turn src_stride into a pointer to row 2
668 "add %1, %1, %0 \n"
669 ".p2align 2 \n"
670 "1: \n"
671 MEMACCESS(0)
672 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
673 MEMACCESS(0)
674 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
675 "subs %3, %3, #8 \n" // 8 processed per loop.
676 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
677 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
678 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
679 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
680 MEMACCESS(1)
681 "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
682 MEMACCESS(1)
683 "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
684 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
685 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
686 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
687 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
688 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
689 "vrshrn.u16 d1, q1, #2 \n"
690 "vrshrn.u16 d2, q2, #2 \n"
691 "vrshrn.u16 d3, q3, #2 \n"
692 MEMACCESS(2)
693 "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
694 "bgt 1b \n"
695 : "+r"(src_ptr), // %0
696 "+r"(src_stride), // %1
697 "+r"(dst), // %2
698 "+r"(dst_width) // %3
699 :
700 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
701 );
702 }
703 #endif //HAS_SCALEARGBROWDOWN2_NEON
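
Per-channel scalar model of the ARGB 2x2 box average (illustrative helper):

#include <stddef.h>
#include <stdint.h>

static void ScaleARGBRowDown2Box_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {  // B, G, R, A, split out by vld4.8
      int i = 8 * x + ch;             // two adjacent ARGB pixels = 8 bytes
      dst[4 * x + ch] =
          (uint8_t)((s[i] + s[i + 4] + t[i] + t[i + 4] + 2) >> 2);
    }
  }
}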
704
705 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
706 // Reads 4 pixels at a time.
707 // Alignment requirement: src_argb 4 byte aligned.
708 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
709 int src_stepx, uint8* dst_argb, int dst_width) {
710 asm volatile (
711 "mov r12, %3, lsl #2 \n"
712 ".p2align 2 \n"
713 "1: \n"
714 MEMACCESS(0)
715 "vld1.32 {d0[0]}, [%0], r12 \n"
716 MEMACCESS(0)
717 "vld1.32 {d0[1]}, [%0], r12 \n"
718 MEMACCESS(0)
719 "vld1.32 {d1[0]}, [%0], r12 \n"
720 MEMACCESS(0)
721 "vld1.32 {d1[1]}, [%0], r12 \n"
722 "subs %2, %2, #4 \n" // 4 pixels per loop.
723 MEMACCESS(1)
724 "vst1.8 {q0}, [%1]! \n"
725 "bgt 1b \n"
726 : "+r"(src_argb), // %0
727 "+r"(dst_argb), // %1
728 "+r"(dst_width) // %2
729 : "r"(src_stepx) // %3
730 : "memory", "cc", "r12", "q0"
731 );
732 }
733 #endif //HAS_SCALEARGBROWDOWNEVEN_NEON
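
Scalar form of the stepping point sampler above (sketch; r12 holds src_stepx scaled to bytes):

#include <stddef.h>
#include <stdint.h>

static void ScaleARGBRowDownEven_C_sketch(const uint8_t* src_argb,
                                          ptrdiff_t src_stride, int src_stepx,
                                          uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];  // "mov r12, %3, lsl #2" = step in bytes
  }
}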
734
735 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
736 // Reads 4 pixels at a time.
737 // Alignment requirement: src_argb 4 byte aligned.
738 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
739 int src_stepx,
740 uint8* dst_argb, int dst_width) {
741 asm volatile (
742 "mov r12, %4, lsl #2 \n"
743 "add %1, %1, %0 \n"
744 ".p2align 2 \n"
745 "1: \n"
746 MEMACCESS(0)
747 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
748 MEMACCESS(1)
749 "vld1.8 {d1}, [%1], r12 \n"
750 MEMACCESS(0)
751 "vld1.8 {d2}, [%0], r12 \n"
752 MEMACCESS(1)
753 "vld1.8 {d3}, [%1], r12 \n"
754 MEMACCESS(0)
755 "vld1.8 {d4}, [%0], r12 \n"
756 MEMACCESS(1)
757 "vld1.8 {d5}, [%1], r12 \n"
758 MEMACCESS(0)
759 "vld1.8 {d6}, [%0], r12 \n"
760 MEMACCESS(1)
761 "vld1.8 {d7}, [%1], r12 \n"
762 "vaddl.u8 q0, d0, d1 \n"
763 "vaddl.u8 q1, d2, d3 \n"
764 "vaddl.u8 q2, d4, d5 \n"
765 "vaddl.u8 q3, d6, d7 \n"
766 "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
767 "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
768 "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
769 "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
770 "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
771 "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
772 "subs %3, %3, #4 \n" // 4 pixels per loop.
773 MEMACCESS(2)
774 "vst1.8 {q0}, [%2]! \n"
775 "bgt 1b \n"
776 : "+r"(src_argb), // %0
777 "+r"(src_stride), // %1
778 "+r"(dst_argb), // %2
779 "+r"(dst_width) // %3
780 : "r"(src_stepx) // %4
781 : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
782 );
783 }
784 #endif // HAS_SCALEARGBROWDOWNEVEN_NEON
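
And the boxed variant in scalar form (sketch only):

#include <stddef.h>
#include <stdint.h>

// Rounded per-channel average of a 2x2 ARGB block at each stepped position.
static void ScaleARGBRowDownEvenBox_C_sketch(const uint8_t* src_argb,
                                             ptrdiff_t src_stride, int src_stepx,
                                             uint8_t* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_argb + (ptrdiff_t)x * src_stepx * 4;
    const uint8_t* t = s + src_stride;
    for (int ch = 0; ch < 4; ++ch) {
      dst_argb[4 * x + ch] =
          (uint8_t)((s[ch] + s[4 + ch] + t[ch] + t[4 + ch] + 2) >> 2);
    }
  }
}
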
785 #endif // __aarch64__
786
787 #ifdef __cplusplus
788 } // extern "C"
789 } // namespace libyuv
790 #endif
