Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_neon64.cc

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
/*
 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"
#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
  "1: \n"
    // load even pixels into v0, odd into v1
    MEMACCESS(0)
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
-   "subs %2, %2, #16 \n" // 16 processed per loop
+   "subs %w2, %w2, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1" // Clobber List
  );
}

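For reference, a scalar sketch of what the routine above computes — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8 is libyuv's typedef from the included headers:

// Keep the odd pixel of each horizontal pair (matches the st1 of v1 above).
static void ScaleRowDown2_C_sketch(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];  // even pixel is discarded
  }
}
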
+ // Read 32x1 average down and write 16x1.
+ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+   asm volatile (
+   "1: \n"
+     MEMACCESS(0)
+     "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
+     "subs %w2, %w2, #16 \n" // 16 processed per loop
+     "uaddlp v0.8h, v0.16b \n" // add adjacent
+     "uaddlp v1.8h, v1.16b \n"
+     "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+     "rshrn2 v0.16b, v1.8h, #1 \n"
+     MEMACCESS(1)
+     "st1 {v0.16b}, [%1], #16 \n"
+     "b.gt 1b \n"
+   : "+r"(src_ptr), // %0
+     "+r"(dst), // %1
+     "+r"(dst_width) // %2
+   :
+   : "v0", "v1" // Clobber List
+   );
+ }
+
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
    MEMACCESS(1)
    "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
-   "subs %3, %3, #16 \n" // 16 processed per loop
+   "subs %w3, %w3, #16 \n" // 16 processed per loop
    "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
    "uaddlp v1.8h, v1.16b \n"
    "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
    "uadalp v1.8h, v3.16b \n"
    "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
    "rshrn2 v0.16b, v1.8h, #2 \n"
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(src_stride), // %1
    "+r"(dst), // %2
    "+r"(dst_width) // %3
  :
  : "v0", "v1", "v2", "v3" // Clobber List
  );
}

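For reference, a scalar sketch of the 2x2 box average above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8 is libyuv's typedef:

// Rounded average of a 2x2 block from two source rows
// (matches uaddlp/uadalp followed by rshrn #2).
static void ScaleRowDown2Box_C_sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[2 * x] + s[2 * x + 1] +
                      t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}
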
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
-   "subs %2, %2, #8 \n" // 8 processed per loop
+   "subs %w2, %w2, #8 \n" // 8 processed per loop
    MEMACCESS(1)
    "st1 {v2.8b}, [%1], #8 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
    MEMACCESS(3)
    "ld1 {v1.16b}, [%2], #16 \n"
    MEMACCESS(4)
    "ld1 {v2.16b}, [%3], #16 \n"
    MEMACCESS(5)
    "ld1 {v3.16b}, [%4], #16 \n"
-   "subs %5, %5, #4 \n"
+   "subs %w5, %w5, #4 \n"
    "uaddlp v0.8h, v0.16b \n"
    "uadalp v0.8h, v1.16b \n"
    "uadalp v0.8h, v2.16b \n"
    "uadalp v0.8h, v3.16b \n"
    "addp v0.8h, v0.8h, v0.8h \n"
    "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
    MEMACCESS(1)
    "st1 {v0.s}[0], [%1], #4 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
(... skipping 10 matching lines ...)
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
-   "subs %2, %2, #24 \n"
+   "subs %w2, %w2, #24 \n"
    "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
    MEMACCESS(1)
    "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi v20.8b, #3 \n"
    "add %3, %3, %0 \n"
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
-   "subs %2, %2, #24 \n"
+   "subs %w2, %w2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "ushll v16.8h, v4.8b, #0 \n"
    "ushll v17.8h, v5.8b, #0 \n"
    "ushll v18.8h, v6.8b, #0 \n"
    "ushll v19.8h, v7.8b, #0 \n"

    // 3 * line_0 + line_1
(... skipping 39 matching lines ...)
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi v20.8b, #3 \n"
    "add %3, %3, %0 \n"
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
-   "subs %2, %2, #24 \n"
+   "subs %w2, %w2, #24 \n"
    // average src line 0 with src line 1
    "urhadd v0.8b, v0.8b, v4.8b \n"
    "urhadd v1.8b, v1.8b, v5.8b \n"
    "urhadd v2.8b, v2.8b, v6.8b \n"
    "urhadd v3.8b, v3.8b, v7.8b \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "ushll v4.8h, v1.8b, #0 \n"
    "umlal v4.8h, v0.8b, v20.8b \n"
    "uqrshrn v0.8b, v4.8h, #2 \n"
(... skipping 32 matching lines ...)
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v3.16b}, [%3] \n"
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
-   "subs %2, %2, #12 \n"
+   "subs %w2, %w2, #12 \n"
    "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
    MEMACCESS(1)
    "st1 {v2.8b}, [%1], #8 \n"
    MEMACCESS(1)
    "st1 {v2.s}[2], [%1], #4 \n"
    "b.gt 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(&kShuf38) // %3
(... skipping 21 matching lines ...)
    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
    MEMACCESS(4)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
-   "subs %4, %4, #12 \n"
+   "subs %w4, %w4, #12 \n"

    // Shuffle the input data around to get align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1 v20.8b, v0.8b, v1.8b \n"
    "trn2 v21.8b, v0.8b, v1.8b \n"
    "trn1 v22.8b, v4.8b, v5.8b \n"
    "trn2 v23.8b, v4.8b, v5.8b \n"
    "trn1 v24.8b, v16.8b, v17.8b \n"
(... skipping 103 matching lines ...)
  "1: \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
    MEMACCESS(3)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
-   "subs %3, %3, #12 \n"
+   "subs %w3, %w3, #12 \n"

    // Shuffle the input data around to get align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1 v16.8b, v0.8b, v1.8b \n"
    "trn2 v17.8b, v0.8b, v1.8b \n"
    "trn1 v18.8b, v4.8b, v5.8b \n"
    "trn2 v19.8b, v4.8b, v5.8b \n"

(... skipping 64 matching lines ...)
    "+r"(dst_ptr), // %1
    "+r"(tmp_src_stride), // %2
    "+r"(dst_width) // %3
  : "r"(&kMult38_Div6), // %4
    "r"(&kShuf38_2) // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );
}

+ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int src_width, int src_height) {
+   const uint8* src_tmp = NULL;
+   asm volatile (
+   "1: \n"
+     "mov %0, %1 \n"
+     "mov w12, %w5 \n"
+     "eor v2.16b, v2.16b, v2.16b \n"
+     "eor v3.16b, v3.16b, v3.16b \n"
+   "2: \n"
+     // load 16 pixels into q0
+     MEMACCESS(0)
+     "ld1 {v0.16b}, [%0], %3 \n"
+     "uaddw2 v3.8h, v3.8h, v0.16b \n"
+     "uaddw v2.8h, v2.8h, v0.8b \n"
+     "subs w12, w12, #1 \n"
+     "b.gt 2b \n"
+     MEMACCESS(2)
+     "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
+     "add %1, %1, #16 \n"
+     "subs %w4, %w4, #16 \n" // 16 processed per loop
+     "b.gt 1b \n"
+   : "+r"(src_tmp), // %0
+     "+r"(src_ptr), // %1
+     "+r"(dst_ptr), // %2
+     "+r"(src_stride), // %3
+     "+r"(src_width), // %4
+     "+r"(src_height) // %5
+   :
+   : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
+   );
+ }
+
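For reference, a scalar sketch of the column-summing loop above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8/uint16 are libyuv's typedefs:

// Sum src_height rows into 16-bit per-column totals
// (matches the inner uaddw/uaddw2 accumulation loop).
static void ScaleAddRows_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[(ptrdiff_t)y * src_stride + x];
    }
    dst_ptr[x] = sum;
  }
}
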
+ // TODO(Yang Zhang): Investigate less load instructions for
+ // the x/dx stepping
+ #define LOAD2_DATA8_LANE(n) \
+     "lsr %5, %3, #16 \n" \
+     "add %6, %1, %5 \n" \
+     "add %3, %3, %4 \n" \
+     MEMACCESS(6) \
+     "ld2 {v4.b, v5.b}["#n"], [%6] \n"
+
+ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+   int dx_offset[4] = {0, 1, 2, 3};
+   int* tmp = dx_offset;
+   const uint8* src_tmp = src_ptr;
+   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+   int64 x64 = (int64) x;
+   int64 dx64 = (int64) dx;
+   asm volatile (
+     "dup v0.4s, %w3 \n" // x
+     "dup v1.4s, %w4 \n" // dx
+     "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+     "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+     "mul v1.4s, v1.4s, v2.4s \n"
+     // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+     "add v1.4s, v1.4s, v0.4s \n"
+     // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+     "add v2.4s, v1.4s, v3.4s \n"
+     "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+   "1: \n"
+     LOAD2_DATA8_LANE(0)
+     LOAD2_DATA8_LANE(1)
+     LOAD2_DATA8_LANE(2)
+     LOAD2_DATA8_LANE(3)
+     LOAD2_DATA8_LANE(4)
+     LOAD2_DATA8_LANE(5)
+     LOAD2_DATA8_LANE(6)
+     LOAD2_DATA8_LANE(7)
+     "mov v6.16b, v1.16b \n"
+     "mov v7.16b, v2.16b \n"
+     "uzp1 v6.8h, v6.8h, v7.8h \n"
+     "ushll v4.8h, v4.8b, #0 \n"
+     "ushll v5.8h, v5.8b, #0 \n"
+     "ssubl v16.4s, v5.4h, v4.4h \n"
+     "ssubl2 v17.4s, v5.8h, v4.8h \n"
+     "ushll v7.4s, v6.4h, #0 \n"
+     "ushll2 v6.4s, v6.8h, #0 \n"
+     "mul v16.4s, v16.4s, v7.4s \n"
+     "mul v17.4s, v17.4s, v6.4s \n"
+     "shrn v6.4h, v16.4s, #16 \n"
+     "shrn2 v6.8h, v17.4s, #16 \n"
+     "add v4.8h, v4.8h, v6.8h \n"
+     "xtn v4.8b, v4.8h \n"
+
+     MEMACCESS(0)
+     "st1 {v4.8b}, [%0], #8 \n" // store pixels
+     "add v1.4s, v1.4s, v0.4s \n"
+     "add v2.4s, v2.4s, v0.4s \n"
+     "subs %w2, %w2, #8 \n" // 8 processed per loop
+     "b.gt 1b \n"
+   : "+r"(dst_ptr), // %0
+     "+r"(src_ptr), // %1
+     "+r"(dst_width64), // %2
+     "+r"(x64), // %3
+     "+r"(dx64), // %4
+     "+r"(tmp), // %5
+     "+r"(src_tmp) // %6
+   :
+   : "memory", "cc", "v0", "v1", "v2", "v3",
+     "v4", "v5", "v6", "v7", "v16", "v17"
+   );
+ }
+
+ #undef LOAD2_DATA8_LANE
+
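For reference, a scalar sketch of the 16.16 fixed-point stepping used above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8 is libyuv's typedef:

// Linear interpolation between src[x >> 16] and the next pixel, stepping x by
// dx per output pixel (what LOAD2_DATA8_LANE plus the mul/shrn sequence vectorizes).
static void ScaleFilterCols_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;       // integer source position
    int xf = x & 0xffff;    // 16-bit fraction
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + (((b - a) * xf) >> 16));
    x += dx;
  }
}
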
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;
  asm volatile (
-   "cmp %4, #0 \n"
+   "cmp %w4, #0 \n"
    "b.eq 100f \n"
    "add %2, %2, %1 \n"
-   "cmp %4, #64 \n"
+   "cmp %w4, #64 \n"
    "b.eq 75f \n"
-   "cmp %4, #128 \n"
+   "cmp %w4, #128 \n"
    "b.eq 50f \n"
-   "cmp %4, #192 \n"
+   "cmp %w4, #192 \n"
    "b.eq 25f \n"

    "dup v5.8b, %w4 \n"
    "dup v4.8b, %w5 \n"
    // General purpose row blend.
  "1: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
-   "subs %3, %3, #16 \n"
+   "subs %w3, %w3, #16 \n"
    "umull v6.8h, v0.8b, v4.8b \n"
    "umull2 v7.8h, v0.16b, v4.16b \n"
    "umlal v6.8h, v1.8b, v5.8b \n"
    "umlal2 v7.8h, v1.16b, v5.16b \n"
    "rshrn v0.8b, v6.8h, #8 \n"
    "rshrn2 v0.16b, v7.8h, #8 \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
  "25: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
-   "subs %3, %3, #16 \n"
+   "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
  "50: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
-   "subs %3, %3, #16 \n"
+   "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
  "75: \n"
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v0.16b}, [%2], #16 \n"
-   "subs %3, %3, #16 \n"
+   "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
-   "subs %3, %3, #16 \n"
+   "subs %w3, %w3, #16 \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 100b \n"

  "99: \n"
    MEMACCESS(0)
    "st1 {v0.b}[15], [%0] \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+r"(src_stride), // %2
    "+r"(dst_width), // %3
    "+r"(source_y_fraction), // %4
    "+r"(y_fraction) // %5
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
  );
}

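For reference, a scalar sketch of the general-purpose row blend above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8 is libyuv's typedef:

// Weighted blend of two rows with an 8-bit fraction and rounding
// (matches the umull/umlal + rshrn #8 path; the 25/50/75/100 labels are
// special cases of the same formula).
static void ScaleFilterRows_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  int y1_fraction = source_y_fraction;   // weight of the second row
  int y0_fraction = 256 - y1_fraction;   // weight of the first row
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
                          src_ptr1[x] * y1_fraction + 128) >> 8);
  }
}
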
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
  "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS (0)
    "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
    MEMACCESS (0)
    "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
-   "subs %2, %2, #8 \n" // 8 processed per loop
+   "subs %w2, %w2, #8 \n" // 8 processed per loop
    MEMACCESS (1)
    "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
    MEMACCESS (1)
    "st1 {v3.16b}, [%1], #16 \n"
    "b.gt 1b \n"
  : "+r" (src_ptr), // %0
    "+r" (dst), // %1
    "+r" (dst_width) // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
  );
}

+ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width) {
+   asm volatile (
+   "1: \n"
+     MEMACCESS (0)
+     // load 8 ARGB pixels.
+     "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
+     "subs %w2, %w2, #8 \n" // 8 processed per loop.
+     "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+     "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+     "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+     "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+     "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+     "rshrn v1.8b, v1.8h, #1 \n"
+     "rshrn v2.8b, v2.8h, #1 \n"
+     "rshrn v3.8b, v3.8h, #1 \n"
+     MEMACCESS (1)
+     "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
+     "b.gt 1b \n"
+   : "+r"(src_argb), // %0
+     "+r"(dst_argb), // %1
+     "+r"(dst_width) // %2
+   :
+   : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+   );
+ }
+
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
  "1: \n"
    MEMACCESS (0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
-   "subs %3, %3, #8 \n" // 8 processed per loop.
+   "subs %w3, %w3, #8 \n" // 8 processed per loop.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
    MEMACCESS (1)
    "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
    "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
    "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
(... skipping 20 matching lines ...)
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.s}[0], [%0], %3 \n"
    MEMACCESS(0)
    "ld1 {v0.s}[1], [%0], %3 \n"
    MEMACCESS(0)
    "ld1 {v0.s}[2], [%0], %3 \n"
    MEMACCESS(0)
    "ld1 {v0.s}[3], [%0], %3 \n"
-   "subs %2, %2, #4 \n" // 4 pixels per loop.
+   "subs %w2, %w2, #4 \n" // 4 pixels per loop.
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(dst_width) // %2
- : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
+ : "r"((int64)(src_stepx * 4)) // %3
  : "memory", "cc", "v0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
- // TODO, might be worth another optimization pass in future.
+ // TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "add %1, %1, %0 \n"
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], %4 \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0], %4 \n"
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1], %4 \n"
    MEMACCESS(0)
    "ld1 {v4.8b}, [%0], %4 \n"
    MEMACCESS(1)
    "ld1 {v5.8b}, [%1], %4 \n"
    MEMACCESS(0)
    "ld1 {v6.8b}, [%0], %4 \n"
    MEMACCESS(1)
    "ld1 {v7.8b}, [%1], %4 \n"
    "uaddl v0.8h, v0.8b, v1.8b \n"
    "uaddl v2.8h, v2.8b, v3.8b \n"
    "uaddl v4.8h, v4.8b, v5.8b \n"
    "uaddl v6.8h, v6.8b, v7.8b \n"
    "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
    "mov v0.d[1], v2.d[0] \n"
    "mov v2.d[0], v16.d[1] \n"
    "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
    "mov v4.d[1], v6.d[0] \n"
    "mov v6.d[0], v16.d[1] \n"
    "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
    "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
    "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
    "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
-   "subs %3, %3, #4 \n" // 4 pixels per loop.
+   "subs %w3, %w3, #4 \n" // 4 pixels per loop.
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_stride), // %1
    "+r"(dst_argb), // %2
    "+r"(dst_width) // %3
- : "r"(src_stepx * 4) // %4
+ : "r"((int64)(src_stepx * 4)) // %4
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
+
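For reference, a scalar sketch of the routine above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8 is libyuv's typedef:

// Each output ARGB pixel is the rounded per-channel average of a 2x2 block,
// sampled every src_stepx source pixels (matches the uaddl/rshrn #2 sequence).
static void ScaleARGBRowDownEvenBox_C_sketch(const uint8* src_argb,
                                             ptrdiff_t src_stride,
                                             int src_stepx,
                                             uint8* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8* s = src_argb + (ptrdiff_t)x * src_stepx * 4;
    const uint8* t = s + src_stride;
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      dst_argb[4 * x + c] =
          (uint8)((s[c] + s[4 + c] + t[c] + t[4 + c] + 2) >> 2);
    }
  }
}
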
+ // TODO(Yang Zhang): Investigate less load instructions for
+ // the x/dx stepping
+ #define LOAD1_DATA32_LANE(vn, n) \
+     "lsr %5, %3, #16 \n" \
+     "add %6, %1, %5, lsl #2 \n" \
+     "add %3, %3, %4 \n" \
+     MEMACCESS(6) \
+     "ld1 {"#vn".s}["#n"], [%6] \n"
+
+ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                         int dst_width, int x, int dx) {
+   const uint8* src_tmp = src_argb;
+   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+   int64 x64 = (int64) x;
+   int64 dx64 = (int64) dx;
+   int64 tmp64 = 0;
+   asm volatile (
+   "1: \n"
+     LOAD1_DATA32_LANE(v0, 0)
+     LOAD1_DATA32_LANE(v0, 1)
+     LOAD1_DATA32_LANE(v0, 2)
+     LOAD1_DATA32_LANE(v0, 3)
+     LOAD1_DATA32_LANE(v1, 0)
+     LOAD1_DATA32_LANE(v1, 1)
+     LOAD1_DATA32_LANE(v1, 2)
+     LOAD1_DATA32_LANE(v1, 3)
+
+     MEMACCESS(0)
+     "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+     "subs %w2, %w2, #8 \n" // 8 processed per loop
+     "b.gt 1b \n"
+   : "+r"(dst_argb), // %0
+     "+r"(src_argb), // %1
+     "+r"(dst_width64), // %2
+     "+r"(x64), // %3
+     "+r"(dx64), // %4
+     "+r"(tmp64), // %5
+     "+r"(src_tmp) // %6
+   :
+   : "memory", "cc", "v0", "v1"
+   );
+ }
+
+ #undef LOAD1_DATA32_LANE
+
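For reference, a scalar sketch of the point-sampling loop above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8/uint32 are libyuv's typedefs:

// Copy one 32-bit ARGB pixel per output position using 16.16 x/dx stepping
// (matches the LOAD1_DATA32_LANE loads above).
static void ScaleARGBCols_C_sketch(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}
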
+ // TODO(Yang Zhang): Investigate less load instructions for
+ // the x/dx stepping
+ #define LOAD2_DATA32_LANE(vn1, vn2, n) \
+     "lsr %5, %3, #16 \n" \
+     "add %6, %1, %5, lsl #2 \n" \
+     "add %3, %3, %4 \n" \
+     MEMACCESS(6) \
+     "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
+
+ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+   int dx_offset[4] = {0, 1, 2, 3};
+   int* tmp = dx_offset;
+   const uint8* src_tmp = src_argb;
+   int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
+   int64 x64 = (int64) x;
+   int64 dx64 = (int64) dx;
+   asm volatile (
+     "dup v0.4s, %w3 \n" // x
+     "dup v1.4s, %w4 \n" // dx
+     "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+     "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+     "mul v1.4s, v1.4s, v2.4s \n"
+     "movi v3.16b, #0x7f \n" // 0x7F
+     "movi v4.8h, #0x7f \n" // 0x7F
+     // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+     "add v5.4s, v1.4s, v0.4s \n"
+   "1: \n"
+     // d0, d1: a
+     // d2, d3: b
+     LOAD2_DATA32_LANE(v0, v1, 0)
+     LOAD2_DATA32_LANE(v0, v1, 1)
+     LOAD2_DATA32_LANE(v0, v1, 2)
+     LOAD2_DATA32_LANE(v0, v1, 3)
+     "shrn v2.4h, v5.4s, #9 \n"
+     "and v2.8b, v2.8b, v4.8b \n"
+     "dup v16.8b, v2.b[0] \n"
+     "dup v17.8b, v2.b[2] \n"
+     "dup v18.8b, v2.b[4] \n"
+     "dup v19.8b, v2.b[6] \n"
+     "ext v2.8b, v16.8b, v17.8b, #4 \n"
+     "ext v17.8b, v18.8b, v19.8b, #4 \n"
+     "ins v2.d[1], v17.d[0] \n" // f
+     "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+     "umull v16.8h, v0.8b, v7.8b \n"
+     "umull2 v17.8h, v0.16b, v7.16b \n"
+     "umull v18.8h, v1.8b, v2.8b \n"
+     "umull2 v19.8h, v1.16b, v2.16b \n"
+     "add v16.8h, v16.8h, v18.8h \n"
+     "add v17.8h, v17.8h, v19.8h \n"
+     "shrn v0.8b, v16.8h, #7 \n"
+     "shrn2 v0.16b, v17.8h, #7 \n"
+
+     MEMACCESS(0)
+     "st1 {v0.4s}, [%0], #16 \n" // store pixels
+     "add v5.4s, v5.4s, v6.4s \n"
+     "subs %w2, %w2, #4 \n" // 4 processed per loop
+     "b.gt 1b \n"
+   : "+r"(dst_argb), // %0
+     "+r"(src_argb), // %1
+     "+r"(dst_width64), // %2
+     "+r"(x64), // %3
+     "+r"(dx64), // %4
+     "+r"(tmp), // %5
+     "+r"(src_tmp) // %6
+   :
+   : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+     "v6", "v7", "v16", "v17", "v18", "v19"
+   );
+ }
+
+ #undef LOAD2_DATA32_LANE
+
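For reference, a scalar sketch of the bilinear column filter above — illustrative only, not part of the patch; the _C_sketch name is hypothetical and uint8 is libyuv's typedef:

// Per-channel blend of the two neighbouring ARGB pixels with a 7-bit fraction
// taken from bits [15:9] of x (matches shrn #9 / and 0x7f / umull / shrn #7).
static void ScaleARGBFilterCols_C_sketch(uint8* dst_argb, const uint8* src_argb,
                                         int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;             // 7-bit fraction
    const uint8* a = src_argb + 4 * xi;  // pixel at xi
    const uint8* b = a + 4;              // pixel at xi + 1
    for (int c = 0; c < 4; ++c) {
      dst_argb[4 * j + c] = (uint8)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}
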
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif