OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/scale.h" | 11 #include "libyuv/scale.h" |
12 #include "libyuv/row.h" | 12 #include "libyuv/row.h" |
13 #include "libyuv/scale_row.h" | 13 #include "libyuv/scale_row.h" |
14 | 14 |
15 #ifdef __cplusplus | 15 #ifdef __cplusplus |
16 namespace libyuv { | 16 namespace libyuv { |
17 extern "C" { | 17 extern "C" { |
18 #endif | 18 #endif |
19 | 19 |
20 // This module is for GCC Neon armv8 64 bit. | 20 // This module is for GCC Neon armv8 64 bit. |
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
22 | 22 |
23 // Read 32x1 throw away even pixels, and write 16x1. | 23 // Read 32x1 throw away even pixels, and write 16x1. |
24 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 24 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
25 uint8* dst, int dst_width) { | 25 uint8* dst, int dst_width) { |
26 asm volatile ( | 26 asm volatile ( |
27 "1: \n" | 27 "1: \n" |
28 // load even pixels into v0, odd into v1 | 28 // load even pixels into v0, odd into v1 |
29 MEMACCESS(0) | 29 MEMACCESS(0) |
30 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" | 30 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" |
31 "subs %2, %2, #16 \n" // 16 processed per loop | 31 "subs %w2, %w2, #16 \n" // 16 processed per loop |
32 MEMACCESS(1) | 32 MEMACCESS(1) |
33 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels | 33 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels |
34 "b.gt 1b \n" | 34 "b.gt 1b \n" |
35 : "+r"(src_ptr), // %0 | 35 : "+r"(src_ptr), // %0 |
36 "+r"(dst), // %1 | 36 "+r"(dst), // %1 |
37 "+r"(dst_width) // %2 | 37 "+r"(dst_width) // %2 |
38 : | 38 : |
39 : "v0", "v1" // Clobber List | 39 : "v0", "v1" // Clobber List |
40 ); | 40 ); |
41 } | 41 } |
42 | 42 |
| 43 // Read 32x1 average down and write 16x1. |
| 44 void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
| 45 uint8* dst, int dst_width) { |
| 46 asm volatile ( |
| 47 "1: \n" |
| 48 MEMACCESS(0) |
| 49 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc |
| 50 "subs %w2, %w2, #16 \n" // 16 processed per loop |
| 51 "uaddlp v0.8h, v0.16b \n" // add adjacent |
| 52 "uaddlp v1.8h, v1.16b \n" |
| 53 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack |
| 54 "rshrn2 v0.16b, v1.8h, #1 \n" |
| 55 MEMACCESS(1) |
| 56 "st1 {v0.16b}, [%1], #16 \n" |
| 57 "b.gt 1b \n" |
| 58 : "+r"(src_ptr), // %0 |
| 59 "+r"(dst), // %1 |
| 60 "+r"(dst_width) // %2 |
| 61 : |
| 62 : "v0", "v1" // Clobber List |
| 63 ); |
| 64 } |
| 65 |
43 // Read 32x2 average down and write 16x1. | 66 // Read 32x2 average down and write 16x1. |
44 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 67 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
45 uint8* dst, int dst_width) { | 68 uint8* dst, int dst_width) { |
46 asm volatile ( | 69 asm volatile ( |
47 // change the stride to row 2 pointer | 70 // change the stride to row 2 pointer |
48 "add %1, %1, %0 \n" | 71 "add %1, %1, %0 \n" |
49 "1: \n" | 72 "1: \n" |
50 MEMACCESS(0) | 73 MEMACCESS(0) |
51 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc | 74 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc |
52 MEMACCESS(1) | 75 MEMACCESS(1) |
53 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc | 76 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc |
54 "subs %3, %3, #16 \n" // 16 processed per loop | 77 "subs %w3, %w3, #16 \n" // 16 processed per loop |
55 "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent | 78 "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent |
56 "uaddlp v1.8h, v1.16b \n" | 79 "uaddlp v1.8h, v1.16b \n" |
57 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 | 80 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 |
58 "uadalp v1.8h, v3.16b \n" | 81 "uadalp v1.8h, v3.16b \n" |
59 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack | 82 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack |
60 "rshrn2 v0.16b, v1.8h, #2 \n" | 83 "rshrn2 v0.16b, v1.8h, #2 \n" |
61 MEMACCESS(2) | 84 MEMACCESS(2) |
62 "st1 {v0.16b}, [%2], #16 \n" | 85 "st1 {v0.16b}, [%2], #16 \n" |
63 "b.gt 1b \n" | 86 "b.gt 1b \n" |
64 : "+r"(src_ptr), // %0 | 87 : "+r"(src_ptr), // %0 |
65 "+r"(src_stride), // %1 | 88 "+r"(src_stride), // %1 |
66 "+r"(dst), // %2 | 89 "+r"(dst), // %2 |
67 "+r"(dst_width) // %3 | 90 "+r"(dst_width) // %3 |
68 : | 91 : |
69 : "v0", "v1", "v2", "v3" // Clobber List | 92 : "v0", "v1", "v2", "v3" // Clobber List |
70 ); | 93 ); |
71 } | 94 } |
72 | 95 |
73 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 96 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
74 uint8* dst_ptr, int dst_width) { | 97 uint8* dst_ptr, int dst_width) { |
75 asm volatile ( | 98 asm volatile ( |
76 "1: \n" | 99 "1: \n" |
77 MEMACCESS(0) | 100 MEMACCESS(0) |
78 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 | 101 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 |
79 "subs %2, %2, #8 \n" // 8 processed per loop | 102 "subs %w2, %w2, #8 \n" // 8 processed per loop |
80 MEMACCESS(1) | 103 MEMACCESS(1) |
81 "st1 {v2.8b}, [%1], #8 \n" | 104 "st1 {v2.8b}, [%1], #8 \n" |
82 "b.gt 1b \n" | 105 "b.gt 1b \n" |
83 : "+r"(src_ptr), // %0 | 106 : "+r"(src_ptr), // %0 |
84 "+r"(dst_ptr), // %1 | 107 "+r"(dst_ptr), // %1 |
85 "+r"(dst_width) // %2 | 108 "+r"(dst_width) // %2 |
86 : | 109 : |
87 : "v0", "v1", "v2", "v3", "memory", "cc" | 110 : "v0", "v1", "v2", "v3", "memory", "cc" |
88 ); | 111 ); |
89 } | 112 } |
90 | 113 |
91 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 114 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
92 uint8* dst_ptr, int dst_width) { | 115 uint8* dst_ptr, int dst_width) { |
93 const uint8* src_ptr1 = src_ptr + src_stride; | 116 const uint8* src_ptr1 = src_ptr + src_stride; |
94 const uint8* src_ptr2 = src_ptr + src_stride * 2; | 117 const uint8* src_ptr2 = src_ptr + src_stride * 2; |
95 const uint8* src_ptr3 = src_ptr + src_stride * 3; | 118 const uint8* src_ptr3 = src_ptr + src_stride * 3; |
96 asm volatile ( | 119 asm volatile ( |
97 "1: \n" | 120 "1: \n" |
98 MEMACCESS(0) | 121 MEMACCESS(0) |
99 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 | 122 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 |
100 MEMACCESS(3) | 123 MEMACCESS(3) |
101 "ld1 {v1.16b}, [%2], #16 \n" | 124 "ld1 {v1.16b}, [%2], #16 \n" |
102 MEMACCESS(4) | 125 MEMACCESS(4) |
103 "ld1 {v2.16b}, [%3], #16 \n" | 126 "ld1 {v2.16b}, [%3], #16 \n" |
104 MEMACCESS(5) | 127 MEMACCESS(5) |
105 "ld1 {v3.16b}, [%4], #16 \n" | 128 "ld1 {v3.16b}, [%4], #16 \n" |
106 "subs %5, %5, #4 \n" | 129 "subs %w5, %w5, #4 \n" |
107 "uaddlp v0.8h, v0.16b \n" | 130 "uaddlp v0.8h, v0.16b \n" |
108 "uadalp v0.8h, v1.16b \n" | 131 "uadalp v0.8h, v1.16b \n" |
109 "uadalp v0.8h, v2.16b \n" | 132 "uadalp v0.8h, v2.16b \n" |
110 "uadalp v0.8h, v3.16b \n" | 133 "uadalp v0.8h, v3.16b \n" |
111 "addp v0.8h, v0.8h, v0.8h \n" | 134 "addp v0.8h, v0.8h, v0.8h \n" |
112 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding | 135 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding |
113 MEMACCESS(1) | 136 MEMACCESS(1) |
114 "st1 {v0.s}[0], [%1], #4 \n" | 137 "st1 {v0.s}[0], [%1], #4 \n" |
115 "b.gt 1b \n" | 138 "b.gt 1b \n" |
116 : "+r"(src_ptr), // %0 | 139 : "+r"(src_ptr), // %0 |
(...skipping 10 matching lines...) Expand all Loading... |
127 // Down scale from 4 to 3 pixels. Use the neon multilane read/write | 150 // Down scale from 4 to 3 pixels. Use the neon multilane read/write |
128 // to load up every 4th pixel into 4 different registers. | 151 // to load up every 4th pixel into 4 different registers. |
129 // Point samples 32 pixels to 24 pixels. | 152 // Point samples 32 pixels to 24 pixels. |
130 void ScaleRowDown34_NEON(const uint8* src_ptr, | 153 void ScaleRowDown34_NEON(const uint8* src_ptr, |
131 ptrdiff_t src_stride, | 154 ptrdiff_t src_stride, |
132 uint8* dst_ptr, int dst_width) { | 155 uint8* dst_ptr, int dst_width) { |
133 asm volatile ( | 156 asm volatile ( |
134 "1: \n" | 157 "1: \n" |
135 MEMACCESS(0) | 158 MEMACCESS(0) |
136 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 | 159 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 |
137 "subs %2, %2, #24 \n" | 160 "subs %w2, %w2, #24 \n" |
138 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 | 161 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 |
139 MEMACCESS(1) | 162 MEMACCESS(1) |
140 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" | 163 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" |
141 "b.gt 1b \n" | 164 "b.gt 1b \n" |
142 : "+r"(src_ptr), // %0 | 165 : "+r"(src_ptr), // %0 |
143 "+r"(dst_ptr), // %1 | 166 "+r"(dst_ptr), // %1 |
144 "+r"(dst_width) // %2 | 167 "+r"(dst_width) // %2 |
145 : | 168 : |
146 : "v0", "v1", "v2", "v3", "memory", "cc" | 169 : "v0", "v1", "v2", "v3", "memory", "cc" |
147 ); | 170 ); |
148 } | 171 } |
149 | 172 |
150 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, | 173 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, |
151 ptrdiff_t src_stride, | 174 ptrdiff_t src_stride, |
152 uint8* dst_ptr, int dst_width) { | 175 uint8* dst_ptr, int dst_width) { |
153 asm volatile ( | 176 asm volatile ( |
154 "movi v20.8b, #3 \n" | 177 "movi v20.8b, #3 \n" |
155 "add %3, %3, %0 \n" | 178 "add %3, %3, %0 \n" |
156 "1: \n" | 179 "1: \n" |
157 MEMACCESS(0) | 180 MEMACCESS(0) |
158 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 | 181 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 |
159 MEMACCESS(3) | 182 MEMACCESS(3) |
160 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 | 183 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 |
161 "subs %2, %2, #24 \n" | 184 "subs %w2, %w2, #24 \n" |
162 | 185 |
163 // filter src line 0 with src line 1 | 186 // filter src line 0 with src line 1 |
164 // expand chars to shorts to allow for room | 187 // expand chars to shorts to allow for room |
165 // when adding lines together | 188 // when adding lines together |
166 "ushll v16.8h, v4.8b, #0 \n" | 189 "ushll v16.8h, v4.8b, #0 \n" |
167 "ushll v17.8h, v5.8b, #0 \n" | 190 "ushll v17.8h, v5.8b, #0 \n" |
168 "ushll v18.8h, v6.8b, #0 \n" | 191 "ushll v18.8h, v6.8b, #0 \n" |
169 "ushll v19.8h, v7.8b, #0 \n" | 192 "ushll v19.8h, v7.8b, #0 \n" |
170 | 193 |
171 // 3 * line_0 + line_1 | 194 // 3 * line_0 + line_1 |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
211 ptrdiff_t src_stride, | 234 ptrdiff_t src_stride, |
212 uint8* dst_ptr, int dst_width) { | 235 uint8* dst_ptr, int dst_width) { |
213 asm volatile ( | 236 asm volatile ( |
214 "movi v20.8b, #3 \n" | 237 "movi v20.8b, #3 \n" |
215 "add %3, %3, %0 \n" | 238 "add %3, %3, %0 \n" |
216 "1: \n" | 239 "1: \n" |
217 MEMACCESS(0) | 240 MEMACCESS(0) |
218 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 | 241 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 |
219 MEMACCESS(3) | 242 MEMACCESS(3) |
220 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 | 243 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 |
221 "subs %2, %2, #24 \n" | 244 "subs %w2, %w2, #24 \n" |
222 // average src line 0 with src line 1 | 245 // average src line 0 with src line 1 |
223 "urhadd v0.8b, v0.8b, v4.8b \n" | 246 "urhadd v0.8b, v0.8b, v4.8b \n" |
224 "urhadd v1.8b, v1.8b, v5.8b \n" | 247 "urhadd v1.8b, v1.8b, v5.8b \n" |
225 "urhadd v2.8b, v2.8b, v6.8b \n" | 248 "urhadd v2.8b, v2.8b, v6.8b \n" |
226 "urhadd v3.8b, v3.8b, v7.8b \n" | 249 "urhadd v3.8b, v3.8b, v7.8b \n" |
227 | 250 |
228 // a0 = (src[0] * 3 + s[1] * 1) >> 2 | 251 // a0 = (src[0] * 3 + s[1] * 1) >> 2 |
229 "ushll v4.8h, v1.8b, #0 \n" | 252 "ushll v4.8h, v1.8b, #0 \n" |
230 "umlal v4.8h, v0.8b, v20.8b \n" | 253 "umlal v4.8h, v0.8b, v20.8b \n" |
231 "uqrshrn v0.8b, v4.8h, #2 \n" | 254 "uqrshrn v0.8b, v4.8h, #2 \n" |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
264 // 32 -> 12 | 287 // 32 -> 12 |
265 void ScaleRowDown38_NEON(const uint8* src_ptr, | 288 void ScaleRowDown38_NEON(const uint8* src_ptr, |
266 ptrdiff_t src_stride, | 289 ptrdiff_t src_stride, |
267 uint8* dst_ptr, int dst_width) { | 290 uint8* dst_ptr, int dst_width) { |
268 asm volatile ( | 291 asm volatile ( |
269 MEMACCESS(3) | 292 MEMACCESS(3) |
270 "ld1 {v3.16b}, [%3] \n" | 293 "ld1 {v3.16b}, [%3] \n" |
271 "1: \n" | 294 "1: \n" |
272 MEMACCESS(0) | 295 MEMACCESS(0) |
273 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" | 296 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" |
274 "subs %2, %2, #12 \n" | 297 "subs %w2, %w2, #12 \n" |
275 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" | 298 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" |
276 MEMACCESS(1) | 299 MEMACCESS(1) |
277 "st1 {v2.8b}, [%1], #8 \n" | 300 "st1 {v2.8b}, [%1], #8 \n" |
278 MEMACCESS(1) | 301 MEMACCESS(1) |
279 "st1 {v2.s}[2], [%1], #4 \n" | 302 "st1 {v2.s}[2], [%1], #4 \n" |
280 "b.gt 1b \n" | 303 "b.gt 1b \n" |
281 : "+r"(src_ptr), // %0 | 304 : "+r"(src_ptr), // %0 |
282 "+r"(dst_ptr), // %1 | 305 "+r"(dst_ptr), // %1 |
283 "+r"(dst_width) // %2 | 306 "+r"(dst_width) // %2 |
284 : "r"(&kShuf38) // %3 | 307 : "r"(&kShuf38) // %3 |
(...skipping 21 matching lines...) Expand all Loading... |
306 // 00 40 01 41 02 42 03 43 | 329 // 00 40 01 41 02 42 03 43 |
307 // 10 50 11 51 12 52 13 53 | 330 // 10 50 11 51 12 52 13 53 |
308 // 20 60 21 61 22 62 23 63 | 331 // 20 60 21 61 22 62 23 63 |
309 // 30 70 31 71 32 72 33 73 | 332 // 30 70 31 71 32 72 33 73 |
310 MEMACCESS(0) | 333 MEMACCESS(0) |
311 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" | 334 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" |
312 MEMACCESS(3) | 335 MEMACCESS(3) |
313 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" | 336 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" |
314 MEMACCESS(4) | 337 MEMACCESS(4) |
315 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" | 338 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" |
316 "subs %4, %4, #12 \n" | 339 "subs %w4, %w4, #12 \n" |
317 | 340 |
318 // Shuffle the input data around to align the data | 341 // Shuffle the input data around to align the data |
319 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 | 342 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 |
320 // 00 10 01 11 02 12 03 13 | 343 // 00 10 01 11 02 12 03 13 |
321 // 40 50 41 51 42 52 43 53 | 344 // 40 50 41 51 42 52 43 53 |
322 "trn1 v20.8b, v0.8b, v1.8b \n" | 345 "trn1 v20.8b, v0.8b, v1.8b \n" |
323 "trn2 v21.8b, v0.8b, v1.8b \n" | 346 "trn2 v21.8b, v0.8b, v1.8b \n" |
324 "trn1 v22.8b, v4.8b, v5.8b \n" | 347 "trn1 v22.8b, v4.8b, v5.8b \n" |
325 "trn2 v23.8b, v4.8b, v5.8b \n" | 348 "trn2 v23.8b, v4.8b, v5.8b \n" |
326 "trn1 v24.8b, v16.8b, v17.8b \n" | 349 "trn1 v24.8b, v16.8b, v17.8b \n" |
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
430 "1: \n" | 453 "1: \n" |
431 | 454 |
432 // 00 40 01 41 02 42 03 43 | 455 // 00 40 01 41 02 42 03 43 |
433 // 10 50 11 51 12 52 13 53 | 456 // 10 50 11 51 12 52 13 53 |
434 // 20 60 21 61 22 62 23 63 | 457 // 20 60 21 61 22 62 23 63 |
435 // 30 70 31 71 32 72 33 73 | 458 // 30 70 31 71 32 72 33 73 |
436 MEMACCESS(0) | 459 MEMACCESS(0) |
437 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" | 460 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" |
438 MEMACCESS(3) | 461 MEMACCESS(3) |
439 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" | 462 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" |
440 "subs %3, %3, #12 \n" | 463 "subs %w3, %w3, #12 \n" |
441 | 464 |
442 // Shuffle the input data around to align the data | 465 // Shuffle the input data around to align the data |
443 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 | 466 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 |
444 // 00 10 01 11 02 12 03 13 | 467 // 00 10 01 11 02 12 03 13 |
445 // 40 50 41 51 42 52 43 53 | 468 // 40 50 41 51 42 52 43 53 |
446 "trn1 v16.8b, v0.8b, v1.8b \n" | 469 "trn1 v16.8b, v0.8b, v1.8b \n" |
447 "trn2 v17.8b, v0.8b, v1.8b \n" | 470 "trn2 v17.8b, v0.8b, v1.8b \n" |
448 "trn1 v18.8b, v4.8b, v5.8b \n" | 471 "trn1 v18.8b, v4.8b, v5.8b \n" |
449 "trn2 v19.8b, v4.8b, v5.8b \n" | 472 "trn2 v19.8b, v4.8b, v5.8b \n" |
450 | 473 |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
515 "+r"(dst_ptr), // %1 | 538 "+r"(dst_ptr), // %1 |
516 "+r"(tmp_src_stride), // %2 | 539 "+r"(tmp_src_stride), // %2 |
517 "+r"(dst_width) // %3 | 540 "+r"(dst_width) // %3 |
518 : "r"(&kMult38_Div6), // %4 | 541 : "r"(&kMult38_Div6), // %4 |
519 "r"(&kShuf38_2) // %5 | 542 "r"(&kShuf38_2) // %5 |
520 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
521 "v18", "v19", "v30", "v31", "memory", "cc" | 544 "v18", "v19", "v30", "v31", "memory", "cc" |
522 ); | 545 ); |
523 } | 546 } |
524 | 547 |
| 548 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
| 549 uint16* dst_ptr, int src_width, int src_height) { |
| 550 const uint8* src_tmp = NULL; |
| 551 asm volatile ( |
| 552 "1: \n" |
| 553 "mov %0, %1 \n" |
| 554 "mov w12, %w5 \n" |
| 555 "eor v2.16b, v2.16b, v2.16b \n" |
| 556 "eor v3.16b, v3.16b, v3.16b \n" |
| 557 "2: \n" |
| 558 // load 16 pixels into q0 |
| 559 MEMACCESS(0) |
| 560 "ld1 {v0.16b}, [%0], %3 \n" |
| 561 "uaddw2 v3.8h, v3.8h, v0.16b \n" |
| 562 "uaddw v2.8h, v2.8h, v0.8b \n" |
| 563 "subs w12, w12, #1 \n" |
| 564 "b.gt 2b \n" |
| 565 MEMACCESS(2) |
| 566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels |
| 567 "add %1, %1, #16 \n" |
| 568 "subs %w4, %w4, #16 \n" // 16 processed per loop |
| 569 "b.gt 1b \n" |
| 570 : "+r"(src_tmp), // %0 |
| 571 "+r"(src_ptr), // %1 |
| 572 "+r"(dst_ptr), // %2 |
| 573 "+r"(src_stride), // %3 |
| 574 "+r"(src_width), // %4 |
| 575 "+r"(src_height) // %5 |
| 576 : |
| 577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List |
| 578 ); |
| 579 } |
| 580 |
| 581 // TODO(Yang Zhang): Investigate less load instructions for |
| 582 // the x/dx stepping |
| 583 #define LOAD2_DATA8_LANE(n) \ |
| 584 "lsr %5, %3, #16 \n" \ |
| 585 "add %6, %1, %5 \n" \ |
| 586 "add %3, %3, %4 \n" \ |
| 587 MEMACCESS(6) \ |
| 588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" |
| 589 |
| 590 void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, |
| 591 int dst_width, int x, int dx) { |
| 592 int dx_offset[4] = {0, 1, 2, 3}; |
| 593 int* tmp = dx_offset; |
| 594 const uint8* src_tmp = src_ptr; |
| 595 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
| 596 int64 x64 = (int64) x; |
| 597 int64 dx64 = (int64) dx; |
| 598 asm volatile ( |
| 599 "dup v0.4s, %w3 \n" // x |
| 600 "dup v1.4s, %w4 \n" // dx |
| 601 "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 |
| 602 "shl v3.4s, v1.4s, #2 \n" // 4 * dx |
| 603 "mul v1.4s, v1.4s, v2.4s \n" |
| 604 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx |
| 605 "add v1.4s, v1.4s, v0.4s \n" |
| 606 // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx |
| 607 "add v2.4s, v1.4s, v3.4s \n" |
| 608 "shl v0.4s, v3.4s, #1 \n" // 8 * dx |
| 609 "1: \n" |
| 610 LOAD2_DATA8_LANE(0) |
| 611 LOAD2_DATA8_LANE(1) |
| 612 LOAD2_DATA8_LANE(2) |
| 613 LOAD2_DATA8_LANE(3) |
| 614 LOAD2_DATA8_LANE(4) |
| 615 LOAD2_DATA8_LANE(5) |
| 616 LOAD2_DATA8_LANE(6) |
| 617 LOAD2_DATA8_LANE(7) |
| 618 "mov v6.16b, v1.16b \n" |
| 619 "mov v7.16b, v2.16b \n" |
| 620 "uzp1 v6.8h, v6.8h, v7.8h \n" |
| 621 "ushll v4.8h, v4.8b, #0 \n" |
| 622 "ushll v5.8h, v5.8b, #0 \n" |
| 623 "ssubl v16.4s, v5.4h, v4.4h \n" |
| 624 "ssubl2 v17.4s, v5.8h, v4.8h \n" |
| 625 "ushll v7.4s, v6.4h, #0 \n" |
| 626 "ushll2 v6.4s, v6.8h, #0 \n" |
| 627 "mul v16.4s, v16.4s, v7.4s \n" |
| 628 "mul v17.4s, v17.4s, v6.4s \n" |
| 629 "shrn v6.4h, v16.4s, #16 \n" |
| 630 "shrn2 v6.8h, v17.4s, #16 \n" |
| 631 "add v4.8h, v4.8h, v6.8h \n" |
| 632 "xtn v4.8b, v4.8h \n" |
| 633 |
| 634 MEMACCESS(0) |
| 635 "st1 {v4.8b}, [%0], #8 \n" // store pixels |
| 636 "add v1.4s, v1.4s, v0.4s \n" |
| 637 "add v2.4s, v2.4s, v0.4s \n" |
| 638 "subs %w2, %w2, #8 \n" // 8 processed per loop |
| 639 "b.gt 1b \n" |
| 640 : "+r"(dst_ptr), // %0 |
| 641 "+r"(src_ptr), // %1 |
| 642 "+r"(dst_width64), // %2 |
| 643 "+r"(x64), // %3 |
| 644 "+r"(dx64), // %4 |
| 645 "+r"(tmp), // %5 |
| 646 "+r"(src_tmp) // %6 |
| 647 : |
| 648 : "memory", "cc", "v0", "v1", "v2", "v3", |
| 649 "v4", "v5", "v6", "v7", "v16", "v17" |
| 650 ); |
| 651 } |
| 652 |
| 653 #undef LOAD2_DATA8_LANE |
| 654 |
525 // 16x2 -> 16x1 | 655 // 16x2 -> 16x1 |
526 void ScaleFilterRows_NEON(uint8* dst_ptr, | 656 void ScaleFilterRows_NEON(uint8* dst_ptr, |
527 const uint8* src_ptr, ptrdiff_t src_stride, | 657 const uint8* src_ptr, ptrdiff_t src_stride, |
528 int dst_width, int source_y_fraction) { | 658 int dst_width, int source_y_fraction) { |
529 int y_fraction = 256 - source_y_fraction; | 659 int y_fraction = 256 - source_y_fraction; |
530 asm volatile ( | 660 asm volatile ( |
531 "cmp %4, #0 \n" | 661 "cmp %w4, #0 \n" |
532 "b.eq 100f \n" | 662 "b.eq 100f \n" |
533 "add %2, %2, %1 \n" | 663 "add %2, %2, %1 \n" |
534 "cmp %4, #64 \n" | 664 "cmp %w4, #64 \n" |
535 "b.eq 75f \n" | 665 "b.eq 75f \n" |
536 "cmp %4, #128 \n" | 666 "cmp %w4, #128 \n" |
537 "b.eq 50f \n" | 667 "b.eq 50f \n" |
538 "cmp %4, #192 \n" | 668 "cmp %w4, #192 \n" |
539 "b.eq 25f \n" | 669 "b.eq 25f \n" |
540 | 670 |
541 "dup v5.8b, %w4 \n" | 671 "dup v5.8b, %w4 \n" |
542 "dup v4.8b, %w5 \n" | 672 "dup v4.8b, %w5 \n" |
543 // General purpose row blend. | 673 // General purpose row blend. |
544 "1: \n" | 674 "1: \n" |
545 MEMACCESS(1) | 675 MEMACCESS(1) |
546 "ld1 {v0.16b}, [%1], #16 \n" | 676 "ld1 {v0.16b}, [%1], #16 \n" |
547 MEMACCESS(2) | 677 MEMACCESS(2) |
548 "ld1 {v1.16b}, [%2], #16 \n" | 678 "ld1 {v1.16b}, [%2], #16 \n" |
549 "subs %3, %3, #16 \n" | 679 "subs %w3, %w3, #16 \n" |
550 "umull v6.8h, v0.8b, v4.8b \n" | 680 "umull v6.8h, v0.8b, v4.8b \n" |
551 "umull2 v7.8h, v0.16b, v4.16b \n" | 681 "umull2 v7.8h, v0.16b, v4.16b \n" |
552 "umlal v6.8h, v1.8b, v5.8b \n" | 682 "umlal v6.8h, v1.8b, v5.8b \n" |
553 "umlal2 v7.8h, v1.16b, v5.16b \n" | 683 "umlal2 v7.8h, v1.16b, v5.16b \n" |
554 "rshrn v0.8b, v6.8h, #8 \n" | 684 "rshrn v0.8b, v6.8h, #8 \n" |
555 "rshrn2 v0.16b, v7.8h, #8 \n" | 685 "rshrn2 v0.16b, v7.8h, #8 \n" |
556 MEMACCESS(0) | 686 MEMACCESS(0) |
557 "st1 {v0.16b}, [%0], #16 \n" | 687 "st1 {v0.16b}, [%0], #16 \n" |
558 "b.gt 1b \n" | 688 "b.gt 1b \n" |
559 "b 99f \n" | 689 "b 99f \n" |
560 | 690 |
561 // Blend 25 / 75. | 691 // Blend 25 / 75. |
562 "25: \n" | 692 "25: \n" |
563 MEMACCESS(1) | 693 MEMACCESS(1) |
564 "ld1 {v0.16b}, [%1], #16 \n" | 694 "ld1 {v0.16b}, [%1], #16 \n" |
565 MEMACCESS(2) | 695 MEMACCESS(2) |
566 "ld1 {v1.16b}, [%2], #16 \n" | 696 "ld1 {v1.16b}, [%2], #16 \n" |
567 "subs %3, %3, #16 \n" | 697 "subs %w3, %w3, #16 \n" |
568 "urhadd v0.16b, v0.16b, v1.16b \n" | 698 "urhadd v0.16b, v0.16b, v1.16b \n" |
569 "urhadd v0.16b, v0.16b, v1.16b \n" | 699 "urhadd v0.16b, v0.16b, v1.16b \n" |
570 MEMACCESS(0) | 700 MEMACCESS(0) |
571 "st1 {v0.16b}, [%0], #16 \n" | 701 "st1 {v0.16b}, [%0], #16 \n" |
572 "b.gt 25b \n" | 702 "b.gt 25b \n" |
573 "b 99f \n" | 703 "b 99f \n" |
574 | 704 |
575 // Blend 50 / 50. | 705 // Blend 50 / 50. |
576 "50: \n" | 706 "50: \n" |
577 MEMACCESS(1) | 707 MEMACCESS(1) |
578 "ld1 {v0.16b}, [%1], #16 \n" | 708 "ld1 {v0.16b}, [%1], #16 \n" |
579 MEMACCESS(2) | 709 MEMACCESS(2) |
580 "ld1 {v1.16b}, [%2], #16 \n" | 710 "ld1 {v1.16b}, [%2], #16 \n" |
581 "subs %3, %3, #16 \n" | 711 "subs %w3, %w3, #16 \n" |
582 "urhadd v0.16b, v0.16b, v1.16b \n" | 712 "urhadd v0.16b, v0.16b, v1.16b \n" |
583 MEMACCESS(0) | 713 MEMACCESS(0) |
584 "st1 {v0.16b}, [%0], #16 \n" | 714 "st1 {v0.16b}, [%0], #16 \n" |
585 "b.gt 50b \n" | 715 "b.gt 50b \n" |
586 "b 99f \n" | 716 "b 99f \n" |
587 | 717 |
588 // Blend 75 / 25. | 718 // Blend 75 / 25. |
589 "75: \n" | 719 "75: \n" |
590 MEMACCESS(1) | 720 MEMACCESS(1) |
591 "ld1 {v1.16b}, [%1], #16 \n" | 721 "ld1 {v1.16b}, [%1], #16 \n" |
592 MEMACCESS(2) | 722 MEMACCESS(2) |
593 "ld1 {v0.16b}, [%2], #16 \n" | 723 "ld1 {v0.16b}, [%2], #16 \n" |
594 "subs %3, %3, #16 \n" | 724 "subs %w3, %w3, #16 \n" |
595 "urhadd v0.16b, v0.16b, v1.16b \n" | 725 "urhadd v0.16b, v0.16b, v1.16b \n" |
596 "urhadd v0.16b, v0.16b, v1.16b \n" | 726 "urhadd v0.16b, v0.16b, v1.16b \n" |
597 MEMACCESS(0) | 727 MEMACCESS(0) |
598 "st1 {v0.16b}, [%0], #16 \n" | 728 "st1 {v0.16b}, [%0], #16 \n" |
599 "b.gt 75b \n" | 729 "b.gt 75b \n" |
600 "b 99f \n" | 730 "b 99f \n" |
601 | 731 |
602 // Blend 100 / 0 - Copy row unchanged. | 732 // Blend 100 / 0 - Copy row unchanged. |
603 "100: \n" | 733 "100: \n" |
604 MEMACCESS(1) | 734 MEMACCESS(1) |
605 "ld1 {v0.16b}, [%1], #16 \n" | 735 "ld1 {v0.16b}, [%1], #16 \n" |
606 "subs %3, %3, #16 \n" | 736 "subs %w3, %w3, #16 \n" |
607 MEMACCESS(0) | 737 MEMACCESS(0) |
608 "st1 {v0.16b}, [%0], #16 \n" | 738 "st1 {v0.16b}, [%0], #16 \n" |
609 "b.gt 100b \n" | 739 "b.gt 100b \n" |
610 | 740 |
611 "99: \n" | 741 "99: \n" |
612 MEMACCESS(0) | 742 MEMACCESS(0) |
613 "st1 {v0.b}[15], [%0] \n" | 743 "st1 {v0.b}[15], [%0] \n" |
614 : "+r"(dst_ptr), // %0 | 744 : "+r"(dst_ptr), // %0 |
615 "+r"(src_ptr), // %1 | 745 "+r"(src_ptr), // %1 |
616 "+r"(src_stride), // %2 | 746 "+r"(src_stride), // %2 |
617 "+r"(dst_width), // %3 | 747 "+r"(dst_width), // %3 |
618 "+r"(source_y_fraction),// %4 | 748 "+r"(source_y_fraction),// %4 |
619 "+r"(y_fraction) // %5 | 749 "+r"(y_fraction) // %5 |
620 : | 750 : |
621 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" | 751 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" |
622 ); | 752 ); |
623 } | 753 } |
624 | 754 |
625 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 755 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
626 uint8* dst, int dst_width) { | 756 uint8* dst, int dst_width) { |
627 asm volatile ( | 757 asm volatile ( |
628 "1: \n" | 758 "1: \n" |
629 // load even pixels into q0, odd into q1 | 759 // load even pixels into q0, odd into q1 |
630 MEMACCESS (0) | 760 MEMACCESS (0) |
631 "ld2 {v0.4s, v1.4s}, [%0], #32 \n" | 761 "ld2 {v0.4s, v1.4s}, [%0], #32 \n" |
632 MEMACCESS (0) | 762 MEMACCESS (0) |
633 "ld2 {v2.4s, v3.4s}, [%0], #32 \n" | 763 "ld2 {v2.4s, v3.4s}, [%0], #32 \n" |
634 "subs %2, %2, #8 \n" // 8 processed per loop | 764 "subs %w2, %w2, #8 \n" // 8 processed per loop |
635 MEMACCESS (1) | 765 MEMACCESS (1) |
636 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels | 766 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels |
637 MEMACCESS (1) | 767 MEMACCESS (1) |
638 "st1 {v3.16b}, [%1], #16 \n" | 768 "st1 {v3.16b}, [%1], #16 \n" |
639 "b.gt 1b \n" | 769 "b.gt 1b \n" |
640 : "+r" (src_ptr), // %0 | 770 : "+r" (src_ptr), // %0 |
641 "+r" (dst), // %1 | 771 "+r" (dst), // %1 |
642 "+r" (dst_width) // %2 | 772 "+r" (dst_width) // %2 |
643 : | 773 : |
644 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List | 774 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List |
645 ); | 775 ); |
646 } | 776 } |
647 | 777 |
| 778 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, |
| 779 uint8* dst_argb, int dst_width) { |
| 780 asm volatile ( |
| 781 "1: \n" |
| 782 MEMACCESS (0) |
| 783 // load 8 ARGB pixels. |
| 784 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" |
| 785 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 786 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 787 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 788 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
| 789 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. |
| 790 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack |
| 791 "rshrn v1.8b, v1.8h, #1 \n" |
| 792 "rshrn v2.8b, v2.8h, #1 \n" |
| 793 "rshrn v3.8b, v3.8h, #1 \n" |
| 794 MEMACCESS (1) |
| 795 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" |
| 796 "b.gt 1b \n" |
| 797 : "+r"(src_argb), // %0 |
| 798 "+r"(dst_argb), // %1 |
| 799 "+r"(dst_width) // %2 |
| 800 : |
| 801 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List |
| 802 ); |
| 803 } |
| 804 |
648 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, | 805 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, |
649 uint8* dst, int dst_width) { | 806 uint8* dst, int dst_width) { |
650 asm volatile ( | 807 asm volatile ( |
651 // change the stride to row 2 pointer | 808 // change the stride to row 2 pointer |
652 "add %1, %1, %0 \n" | 809 "add %1, %1, %0 \n" |
653 "1: \n" | 810 "1: \n" |
654 MEMACCESS (0) | 811 MEMACCESS (0) |
655 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. | 812 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. |
656 "subs %3, %3, #8 \n" // 8 processed per loop. | 813 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
657 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 814 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
658 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 815 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
659 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 816 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
660 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. | 817 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. |
661 MEMACCESS (1) | 818 MEMACCESS (1) |
662 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. | 819 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. |
663 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. | 820 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. |
664 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. | 821 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. |
665 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. | 822 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. |
666 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. | 823 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. |
(...skipping 20 matching lines...) Expand all Loading... |
687 asm volatile ( | 844 asm volatile ( |
688 "1: \n" | 845 "1: \n" |
689 MEMACCESS(0) | 846 MEMACCESS(0) |
690 "ld1 {v0.s}[0], [%0], %3 \n" | 847 "ld1 {v0.s}[0], [%0], %3 \n" |
691 MEMACCESS(0) | 848 MEMACCESS(0) |
692 "ld1 {v0.s}[1], [%0], %3 \n" | 849 "ld1 {v0.s}[1], [%0], %3 \n" |
693 MEMACCESS(0) | 850 MEMACCESS(0) |
694 "ld1 {v0.s}[2], [%0], %3 \n" | 851 "ld1 {v0.s}[2], [%0], %3 \n" |
695 MEMACCESS(0) | 852 MEMACCESS(0) |
696 "ld1 {v0.s}[3], [%0], %3 \n" | 853 "ld1 {v0.s}[3], [%0], %3 \n" |
697 "subs %2, %2, #4 \n" // 4 pixels per loop. | 854 "subs %w2, %w2, #4 \n" // 4 pixels per loop. |
698 MEMACCESS(1) | 855 MEMACCESS(1) |
699 "st1 {v0.16b}, [%1], #16 \n" | 856 "st1 {v0.16b}, [%1], #16 \n" |
700 "b.gt 1b \n" | 857 "b.gt 1b \n" |
701 : "+r"(src_argb), // %0 | 858 : "+r"(src_argb), // %0 |
702 "+r"(dst_argb), // %1 | 859 "+r"(dst_argb), // %1 |
703 "+r"(dst_width) // %2 | 860 "+r"(dst_width) // %2 |
704 : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3 | 861 : "r"((int64)(src_stepx * 4)) // %3 |
705 : "memory", "cc", "v0" | 862 : "memory", "cc", "v0" |
706 ); | 863 ); |
707 } | 864 } |
708 | 865 |
709 // Reads 4 pixels at a time. | 866 // Reads 4 pixels at a time. |
710 // Alignment requirement: src_argb 4 byte aligned. | 867 // Alignment requirement: src_argb 4 byte aligned. |
711 // TODO, might be worth another optimization pass in future. | 868 // TODO(Yang Zhang): Might be worth another optimization pass in future. |
712 // It could be upgraded to 8 pixels at a time to start with. | 869 // It could be upgraded to 8 pixels at a time to start with. |
// ScaleARGBRowDownEvenBox_NEON: averages 2x2 blocks of ARGB pixels taken from
// two adjacent rows, producing 4 output pixels (16 bytes) per loop iteration.
//   src_argb   - first source row; advanced by src_stepx * 4 bytes per load.
//   src_stride - byte distance to the second row; the first instruction adds
//                src_argb to it, so operand %1 is used as the second-row
//                pointer from then on.
//   src_stepx  - horizontal step between sampled blocks, in pixels.
//   dst_argb   - destination row.
//   dst_width  - output pixel count; it is decremented by 4 per iteration and
//                the loop repeats while the result is > 0, so output is
//                written in whole groups of 4 pixels.
// OLD and NEW differ only in the counter width ("%3" vs "%w3") and in the
// step operand being cast to int64 in NEW.
713 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, | 870 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, |
714 int src_stepx, | 871 int src_stepx, |
715 uint8* dst_argb, int dst_width) { | 872 uint8* dst_argb, int dst_width) { |
716 asm volatile ( | 873 asm volatile ( |
// %1 = src_stride + src_argb: pointer to the row below.
717 "add %1, %1, %0 \n" | 874 "add %1, %1, %0 \n" |
718 "1: \n" | 875 "1: \n" |
// Four pairs of 8-byte loads: two ARGB pixels from the top row (even
// registers) and two from the bottom row (odd registers) per block.
719 MEMACCESS(0) | 876 MEMACCESS(0) |
720 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 | 877 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 |
721 MEMACCESS(1) | 878 MEMACCESS(1) |
722 "ld1 {v1.8b}, [%1], %4 \n" | 879 "ld1 {v1.8b}, [%1], %4 \n" |
723 MEMACCESS(0) | 880 MEMACCESS(0) |
724 "ld1 {v2.8b}, [%0], %4 \n" | 881 "ld1 {v2.8b}, [%0], %4 \n" |
725 MEMACCESS(1) | 882 MEMACCESS(1) |
726 "ld1 {v3.8b}, [%1], %4 \n" | 883 "ld1 {v3.8b}, [%1], %4 \n" |
727 MEMACCESS(0) | 884 MEMACCESS(0) |
728 "ld1 {v4.8b}, [%0], %4 \n" | 885 "ld1 {v4.8b}, [%0], %4 \n" |
729 MEMACCESS(1) | 886 MEMACCESS(1) |
730 "ld1 {v5.8b}, [%1], %4 \n" | 887 "ld1 {v5.8b}, [%1], %4 \n" |
731 MEMACCESS(0) | 888 MEMACCESS(0) |
732 "ld1 {v6.8b}, [%0], %4 \n" | 889 "ld1 {v6.8b}, [%0], %4 \n" |
733 MEMACCESS(1) | 890 MEMACCESS(1) |
734 "ld1 {v7.8b}, [%1], %4 \n" | 891 "ld1 {v7.8b}, [%1], %4 \n" |
// Vertical sums: top row + bottom row, widened from bytes to 16-bit lanes.
735 "uaddl v0.8h, v0.8b, v1.8b \n" | 892 "uaddl v0.8h, v0.8b, v1.8b \n" |
736 "uaddl v2.8h, v2.8b, v3.8b \n" | 893 "uaddl v2.8h, v2.8b, v3.8b \n" |
737 "uaddl v4.8h, v4.8b, v5.8b \n" | 894 "uaddl v4.8h, v4.8b, v5.8b \n" |
738 "uaddl v6.8h, v6.8b, v7.8b \n" | 895 "uaddl v6.8h, v6.8b, v7.8b \n" |
// Swap the high half of v0 with the low half of v2 (and likewise v4/v6),
// using v16.d[1] as the temporary, so the adds below pair up the two
// horizontally adjacent pixels of each block.
739 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd | 896 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd |
740 "mov v0.d[1], v2.d[0] \n" | 897 "mov v0.d[1], v2.d[0] \n" |
741 "mov v2.d[0], v16.d[1] \n" | 898 "mov v2.d[0], v16.d[1] \n" |
742 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh | 899 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh |
743 "mov v4.d[1], v6.d[0] \n" | 900 "mov v4.d[1], v6.d[0] \n" |
744 "mov v6.d[0], v16.d[1] \n" | 901 "mov v6.d[0], v16.d[1] \n" |
745 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) | 902 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) |
746 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) | 903 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) |
// Rounding shift right by 2 and narrow to bytes: the 4-pixel average.
747 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. | 904 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. |
748 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. | 905 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. |
749 "subs %3, %3, #4 \n" // 4 pixels per loop. | 906 "subs %w3, %w3, #4 \n" // 4 pixels per loop. |
750 MEMACCESS(2) | 907 MEMACCESS(2) |
751 "st1 {v0.16b}, [%2], #16 \n" | 908 "st1 {v0.16b}, [%2], #16 \n" |
752 "b.gt 1b \n" | 909 "b.gt 1b \n" |
753 : "+r"(src_argb), // %0 | 910 : "+r"(src_argb), // %0 |
754 "+r"(src_stride), // %1 | 911 "+r"(src_stride), // %1 |
755 "+r"(dst_argb), // %2 | 912 "+r"(dst_argb), // %2 |
756 "+r"(dst_width) // %3 | 913 "+r"(dst_width) // %3 |
757 : "r"(src_stepx * 4) // %4 | 914 : "r"((int64)(src_stepx * 4)) // %4 |
758 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 915 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
759 ); | 916 ); |
760 } | 917 } |
| 918 |
| 919 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 920 // the x/dx stepping |
// LOAD1_DATA32_LANE(vn, n): asm fragment that fetches one 32-bit pixel into
// lane n of vector register vn. Operand numbers refer to the asm statement
// that expands the macro:
//   %5 = %3 >> 16        (integer pixel index; %3 carries 16 fraction bits)
//   %6 = %1 + (%5 << 2)  (byte address of that 4-byte pixel)
//   %3 = %3 + %4         (advance the position by the step)
//   vn.s[n] = 32 bits loaded from [%6]
// %5 and %6 are overwritten on every expansion, so they are scratch operands.
| 921 #define LOAD1_DATA32_LANE(vn, n) \ |
| 922 "lsr %5, %3, #16 \n" \ |
| 923 "add %6, %1, %5, lsl #2 \n" \ |
| 924 "add %3, %3, %4 \n" \ |
| 925 MEMACCESS(6) \ |
| 926 "ld1 {"#vn".s}["#n"], [%6] \n" |
| 927 |
// ScaleARGBCols_NEON: point-samples ARGB pixels along a row. For each output
// pixel it copies the 4-byte source pixel at index (x >> 16) and then adds dx
// to x (see LOAD1_DATA32_LANE), i.e. x and dx carry 16 fraction bits.
//   dst_argb  - destination row; 32 bytes (8 pixels) are stored per iteration.
//   src_argb  - source row base; not advanced, only indexed.
//   dst_width - output pixel count; decremented by 8 per iteration and the
//               loop repeats while the result is > 0, so output is written in
//               whole groups of 8 pixels.
//   x, dx     - starting position and per-pixel step.
// tmp64 and src_tmp exist only to give the macro two scratch registers
// (%5 and %6); both are overwritten before they are read.
| 928 void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, |
| 929 int dst_width, int x, int dx) { |
| 930 const uint8* src_tmp = src_argb; |
| 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
| 932 int64 x64 = (int64) x; |
| 933 int64 dx64 = (int64) dx; |
| 934 int64 tmp64 = 0; |
| 935 asm volatile ( |
| 936 "1: \n" |
// Gather 8 source pixels: lanes 0..3 of v0, then lanes 0..3 of v1.
| 937 LOAD1_DATA32_LANE(v0, 0) |
| 938 LOAD1_DATA32_LANE(v0, 1) |
| 939 LOAD1_DATA32_LANE(v0, 2) |
| 940 LOAD1_DATA32_LANE(v0, 3) |
| 941 LOAD1_DATA32_LANE(v1, 0) |
| 942 LOAD1_DATA32_LANE(v1, 1) |
| 943 LOAD1_DATA32_LANE(v1, 2) |
| 944 LOAD1_DATA32_LANE(v1, 3) |
| 945 |
| 946 MEMACCESS(0) |
| 947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels |
| 948 "subs %w2, %w2, #8 \n" // 8 processed per loop |
| 949 "b.gt 1b \n" |
| 950 : "+r"(dst_argb), // %0 |
| 951 "+r"(src_argb), // %1 |
| 952 "+r"(dst_width64), // %2 |
| 953 "+r"(x64), // %3 |
| 954 "+r"(dx64), // %4 |
| 955 "+r"(tmp64), // %5 |
| 956 "+r"(src_tmp) // %6 |
| 957 : |
| 958 : "memory", "cc", "v0", "v1" |
| 959 ); |
| 960 } |
| 961 |
| 962 #undef LOAD1_DATA32_LANE |
| 963 |
| 964 // TODO(Yang Zhang): Investigate fewer load instructions for |
| 965 // the x/dx stepping |
// LOAD2_DATA32_LANE(vn1, vn2, n): asm fragment that fetches two consecutive
// 32-bit pixels; the first goes to lane n of vn1 and the second to lane n of
// vn2. Operand numbers refer to the asm statement that expands the macro:
//   %5 = %3 >> 16        (integer pixel index; %3 carries 16 fraction bits)
//   %6 = %1 + (%5 << 2)  (byte address of the first of the two pixels)
//   %3 = %3 + %4         (advance the position by the step)
//   vn1.s[n], vn2.s[n] = the two 32-bit words loaded from [%6]
// %5 and %6 are overwritten on every expansion, so they are scratch operands.
| 966 #define LOAD2_DATA32_LANE(vn1, vn2, n) \ |
| 967 "lsr %5, %3, #16 \n" \ |
| 968 "add %6, %1, %5, lsl #2 \n" \ |
| 969 "add %3, %3, %4 \n" \ |
| 970 MEMACCESS(6) \ |
| 971 "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" |
| 972 |
// ScaleARGBFilterCols_NEON: horizontally interpolates ARGB pixels along a row.
// For each output pixel it loads the source pixel at index (x >> 16) and the
// one after it (call them a and b), derives a 7-bit fraction f from bits
// 9..15 of x, and writes (a * (0x7f ^ f) + b * f) >> 7 per byte channel.
//   dst_argb  - destination row; 16 bytes (4 pixels) are stored per iteration.
//   src_argb  - source row base; not advanced, only indexed.
//   dst_width - output pixel count; decremented by 4 per iteration and the
//               loop repeats while the result is > 0, so output is written in
//               whole groups of 4 pixels.
//   x, dx     - starting position and per-pixel step (16 fraction bits, as
//               implied by the ">> 16" in LOAD2_DATA32_LANE).
// Operand %5 (tmp) points at dx_offset only for the initial ld1; afterwards
// the macro reuses it as a scratch register. src_tmp only provides scratch %6.
| 973 void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, |
| 974 int dst_width, int x, int dx) { |
| 975 int dx_offset[4] = {0, 1, 2, 3}; |
| 976 int* tmp = dx_offset; |
| 977 const uint8* src_tmp = src_argb; |
| 978 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. |
| 979 int64 x64 = (int64) x; |
| 980 int64 dx64 = (int64) dx; |
| 981 asm volatile ( |
| 982 "dup v0.4s, %w3 \n" // x |
| 983 "dup v1.4s, %w4 \n" // dx |
| 984 "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 |
| 985 "shl v6.4s, v1.4s, #2 \n" // 4 * dx |
| 986 "mul v1.4s, v1.4s, v2.4s \n" |
| 987 "movi v3.16b, #0x7f \n" // 0x7F |
| 988 "movi v4.8h, #0x7f \n" // 0x7F |
| 989 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx |
| 990 "add v5.4s, v1.4s, v0.4s \n" |
| 991 "1: \n" |
| 992 // v0: a (pixel at the integer position, one per 32-bit lane) |
| 993 // v1: b (the following pixel, one per 32-bit lane) |
| 994 LOAD2_DATA32_LANE(v0, v1, 0) |
| 995 LOAD2_DATA32_LANE(v0, v1, 1) |
| 996 LOAD2_DATA32_LANE(v0, v1, 2) |
| 997 LOAD2_DATA32_LANE(v0, v1, 3) |
// Fraction per pixel: (position >> 9) & 0x7f, held in 16-bit lanes of v2.
| 998 "shrn v2.4h, v5.4s, #9 \n" |
| 999 "and v2.8b, v2.8b, v4.8b \n" |
// Broadcast each pixel's fraction byte, then splice so every one of the
// 4 channel bytes of a pixel sees that pixel's fraction.
| 1000 "dup v16.8b, v2.b[0] \n" |
| 1001 "dup v17.8b, v2.b[2] \n" |
| 1002 "dup v18.8b, v2.b[4] \n" |
| 1003 "dup v19.8b, v2.b[6] \n" |
| 1004 "ext v2.8b, v16.8b, v17.8b, #4 \n" |
| 1005 "ext v17.8b, v18.8b, v19.8b, #4 \n" |
| 1006 "ins v2.d[1], v17.d[0] \n" // f |
| 1007 "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f |
// Weighted sum in 16 bits: a * (0x7f ^ f) + b * f, then >> 7 back to bytes.
| 1008 "umull v16.8h, v0.8b, v7.8b \n" |
| 1009 "umull2 v17.8h, v0.16b, v7.16b \n" |
| 1010 "umull v18.8h, v1.8b, v2.8b \n" |
| 1011 "umull2 v19.8h, v1.16b, v2.16b \n" |
| 1012 "add v16.8h, v16.8h, v18.8h \n" |
| 1013 "add v17.8h, v17.8h, v19.8h \n" |
| 1014 "shrn v0.8b, v16.8h, #7 \n" |
| 1015 "shrn2 v0.16b, v17.8h, #7 \n" |
| 1016 |
| 1017 MEMACCESS(0) |
| 1018 "st1 {v0.4s}, [%0], #16 \n" // store pixels |
// Advance the four tracked positions by 4 * dx for the next group.
| 1019 "add v5.4s, v5.4s, v6.4s \n" |
| 1020 "subs %w2, %w2, #4 \n" // 4 processed per loop |
| 1021 "b.gt 1b \n" |
| 1022 : "+r"(dst_argb), // %0 |
| 1023 "+r"(src_argb), // %1 |
| 1024 "+r"(dst_width64), // %2 |
| 1025 "+r"(x64), // %3 |
| 1026 "+r"(dx64), // %4 |
| 1027 "+r"(tmp), // %5 |
| 1028 "+r"(src_tmp) // %6 |
| 1029 : |
| 1030 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", |
| 1031 "v6", "v7", "v16", "v17", "v18", "v19" |
| 1032 ); |
| 1033 } |
| 1034 |
| 1035 #undef LOAD2_DATA32_LANE |
| 1036 |
761 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 1037 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
762 | 1038 |
763 #ifdef __cplusplus | 1039 #ifdef __cplusplus |
764 } // extern "C" | 1040 } // extern "C" |
765 } // namespace libyuv | 1041 } // namespace libyuv |
766 #endif | 1042 #endif |
OLD | NEW |