OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ | 120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ |
121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ | 121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ |
122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ | 122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ |
123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ | 123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ |
124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ | 124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ |
125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ | 125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ |
126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ | 126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ |
127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ | 127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ |
128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ | 128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ |
129 | 129 |
130 #ifdef HAS_I444TOARGBROW_NEON | |
131 void I444ToARGBRow_NEON(const uint8* src_y, | 130 void I444ToARGBRow_NEON(const uint8* src_y, |
132 const uint8* src_u, | 131 const uint8* src_u, |
133 const uint8* src_v, | 132 const uint8* src_v, |
134 uint8* dst_argb, | 133 uint8* dst_argb, |
135 const struct YuvConstants* yuvconstants, | 134 const struct YuvConstants* yuvconstants, |
136 int width) { | 135 int width) { |
137 asm volatile ( | 136 asm volatile ( |
138 YUVTORGB_SETUP | 137 YUVTORGB_SETUP |
139 "movi v23.8b, #255 \n" /* A */ | 138 "movi v23.8b, #255 \n" /* A */ |
140 "1: \n" | 139 "1: \n" |
141 READYUV444 | 140 READYUV444 |
142 YUVTORGB(v22, v21, v20) | 141 YUVTORGB(v22, v21, v20) |
143 "subs %w4, %w4, #8 \n" | 142 "subs %w4, %w4, #8 \n" |
144 MEMACCESS(3) | 143 MEMACCESS(3) |
145 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 144 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
146 "b.gt 1b \n" | 145 "b.gt 1b \n" |
147 : "+r"(src_y), // %0 | 146 : "+r"(src_y), // %0 |
148 "+r"(src_u), // %1 | 147 "+r"(src_u), // %1 |
149 "+r"(src_v), // %2 | 148 "+r"(src_v), // %2 |
150 "+r"(dst_argb), // %3 | 149 "+r"(dst_argb), // %3 |
151 "+r"(width) // %4 | 150 "+r"(width) // %4 |
152 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 151 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
153 [kUVToG]"r"(&yuvconstants->kUVToG), | 152 [kUVToG]"r"(&yuvconstants->kUVToG), |
154 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 153 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
155 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 154 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
156 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 155 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
157 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 156 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
158 ); | 157 ); |
159 } | 158 } |
160 #endif // HAS_I444TOARGBROW_NEON | |
161 | 159 |
162 #ifdef HAS_I422TOARGBROW_NEON | |
163 void I422ToARGBRow_NEON(const uint8* src_y, | 160 void I422ToARGBRow_NEON(const uint8* src_y, |
164 const uint8* src_u, | 161 const uint8* src_u, |
165 const uint8* src_v, | 162 const uint8* src_v, |
166 uint8* dst_argb, | 163 uint8* dst_argb, |
167 const struct YuvConstants* yuvconstants, | 164 const struct YuvConstants* yuvconstants, |
168 int width) { | 165 int width) { |
169 asm volatile ( | 166 asm volatile ( |
170 YUVTORGB_SETUP | 167 YUVTORGB_SETUP |
171 "movi v23.8b, #255 \n" /* A */ | 168 "movi v23.8b, #255 \n" /* A */ |
172 "1: \n" | 169 "1: \n" |
173 READYUV422 | 170 READYUV422 |
174 YUVTORGB(v22, v21, v20) | 171 YUVTORGB(v22, v21, v20) |
175 "subs %w4, %w4, #8 \n" | 172 "subs %w4, %w4, #8 \n" |
176 MEMACCESS(3) | 173 MEMACCESS(3) |
177 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 174 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
178 "b.gt 1b \n" | 175 "b.gt 1b \n" |
179 : "+r"(src_y), // %0 | 176 : "+r"(src_y), // %0 |
180 "+r"(src_u), // %1 | 177 "+r"(src_u), // %1 |
181 "+r"(src_v), // %2 | 178 "+r"(src_v), // %2 |
182 "+r"(dst_argb), // %3 | 179 "+r"(dst_argb), // %3 |
183 "+r"(width) // %4 | 180 "+r"(width) // %4 |
184 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 181 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
185 [kUVToG]"r"(&yuvconstants->kUVToG), | 182 [kUVToG]"r"(&yuvconstants->kUVToG), |
186 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 183 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
187 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 184 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
188 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 185 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
189 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 186 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
190 ); | 187 ); |
191 } | 188 } |
192 #endif // HAS_I422TOARGBROW_NEON | |
193 | 189 |
194 #ifdef HAS_I422ALPHATOARGBROW_NEON | |
195 void I422AlphaToARGBRow_NEON(const uint8* src_y, | 190 void I422AlphaToARGBRow_NEON(const uint8* src_y, |
196 const uint8* src_u, | 191 const uint8* src_u, |
197 const uint8* src_v, | 192 const uint8* src_v, |
198 const uint8* src_a, | 193 const uint8* src_a, |
199 uint8* dst_argb, | 194 uint8* dst_argb, |
200 const struct YuvConstants* yuvconstants, | 195 const struct YuvConstants* yuvconstants, |
201 int width) { | 196 int width) { |
202 asm volatile ( | 197 asm volatile ( |
203 YUVTORGB_SETUP | 198 YUVTORGB_SETUP |
204 "1: \n" | 199 "1: \n" |
(...skipping 12 matching lines...) Expand all Loading... |
217 "+r"(dst_argb), // %4 | 212 "+r"(dst_argb), // %4 |
218 "+r"(width) // %5 | 213 "+r"(width) // %5 |
219 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 214 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
220 [kUVToG]"r"(&yuvconstants->kUVToG), | 215 [kUVToG]"r"(&yuvconstants->kUVToG), |
221 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 216 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
222 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 217 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
223 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
224 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 219 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
225 ); | 220 ); |
226 } | 221 } |
227 #endif // HAS_I422ALPHATOARGBROW_NEON | |
228 | 222 |
229 #ifdef HAS_I411TOARGBROW_NEON | |
230 void I411ToARGBRow_NEON(const uint8* src_y, | 223 void I411ToARGBRow_NEON(const uint8* src_y, |
231 const uint8* src_u, | 224 const uint8* src_u, |
232 const uint8* src_v, | 225 const uint8* src_v, |
233 uint8* dst_argb, | 226 uint8* dst_argb, |
234 const struct YuvConstants* yuvconstants, | 227 const struct YuvConstants* yuvconstants, |
235 int width) { | 228 int width) { |
236 asm volatile ( | 229 asm volatile ( |
237 YUVTORGB_SETUP | 230 YUVTORGB_SETUP |
238 "movi v23.8b, #255 \n" /* A */ | 231 "movi v23.8b, #255 \n" /* A */ |
239 "1: \n" | 232 "1: \n" |
240 READYUV411 | 233 READYUV411 |
241 YUVTORGB(v22, v21, v20) | 234 YUVTORGB(v22, v21, v20) |
242 "subs %w4, %w4, #8 \n" | 235 "subs %w4, %w4, #8 \n" |
243 MEMACCESS(3) | 236 MEMACCESS(3) |
244 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
245 "b.gt 1b \n" | 238 "b.gt 1b \n" |
246 : "+r"(src_y), // %0 | 239 : "+r"(src_y), // %0 |
247 "+r"(src_u), // %1 | 240 "+r"(src_u), // %1 |
248 "+r"(src_v), // %2 | 241 "+r"(src_v), // %2 |
249 "+r"(dst_argb), // %3 | 242 "+r"(dst_argb), // %3 |
250 "+r"(width) // %4 | 243 "+r"(width) // %4 |
251 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 244 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
252 [kUVToG]"r"(&yuvconstants->kUVToG), | 245 [kUVToG]"r"(&yuvconstants->kUVToG), |
253 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
254 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 247 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
255 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
256 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
257 ); | 250 ); |
258 } | 251 } |
259 #endif // HAS_I411TOARGBROW_NEON | |
260 | 252 |
261 #ifdef HAS_I422TORGBAROW_NEON | |
262 void I422ToRGBARow_NEON(const uint8* src_y, | 253 void I422ToRGBARow_NEON(const uint8* src_y, |
263 const uint8* src_u, | 254 const uint8* src_u, |
264 const uint8* src_v, | 255 const uint8* src_v, |
265 uint8* dst_rgba, | 256 uint8* dst_rgba, |
266 const struct YuvConstants* yuvconstants, | 257 const struct YuvConstants* yuvconstants, |
267 int width) { | 258 int width) { |
268 asm volatile ( | 259 asm volatile ( |
269 YUVTORGB_SETUP | 260 YUVTORGB_SETUP |
270 "movi v20.8b, #255 \n" /* A */ | 261 "movi v20.8b, #255 \n" /* A */ |
271 "1: \n" | 262 "1: \n" |
272 READYUV422 | 263 READYUV422 |
273 YUVTORGB(v23, v22, v21) | 264 YUVTORGB(v23, v22, v21) |
274 "subs %w4, %w4, #8 \n" | 265 "subs %w4, %w4, #8 \n" |
275 MEMACCESS(3) | 266 MEMACCESS(3) |
276 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 267 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
277 "b.gt 1b \n" | 268 "b.gt 1b \n" |
278 : "+r"(src_y), // %0 | 269 : "+r"(src_y), // %0 |
279 "+r"(src_u), // %1 | 270 "+r"(src_u), // %1 |
280 "+r"(src_v), // %2 | 271 "+r"(src_v), // %2 |
281 "+r"(dst_rgba), // %3 | 272 "+r"(dst_rgba), // %3 |
282 "+r"(width) // %4 | 273 "+r"(width) // %4 |
283 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 274 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
284 [kUVToG]"r"(&yuvconstants->kUVToG), | 275 [kUVToG]"r"(&yuvconstants->kUVToG), |
285 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 276 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
286 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 277 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
287 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 278 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
288 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 279 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
289 ); | 280 ); |
290 } | 281 } |
291 #endif // HAS_I422TORGBAROW_NEON | |
292 | 282 |
293 #ifdef HAS_I422TORGB24ROW_NEON | |
294 void I422ToRGB24Row_NEON(const uint8* src_y, | 283 void I422ToRGB24Row_NEON(const uint8* src_y, |
295 const uint8* src_u, | 284 const uint8* src_u, |
296 const uint8* src_v, | 285 const uint8* src_v, |
297 uint8* dst_rgb24, | 286 uint8* dst_rgb24, |
298 const struct YuvConstants* yuvconstants, | 287 const struct YuvConstants* yuvconstants, |
299 int width) { | 288 int width) { |
300 asm volatile ( | 289 asm volatile ( |
301 YUVTORGB_SETUP | 290 YUVTORGB_SETUP |
302 "1: \n" | 291 "1: \n" |
303 READYUV422 | 292 READYUV422 |
304 YUVTORGB(v22, v21, v20) | 293 YUVTORGB(v22, v21, v20) |
305 "subs %w4, %w4, #8 \n" | 294 "subs %w4, %w4, #8 \n" |
306 MEMACCESS(3) | 295 MEMACCESS(3) |
307 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" | 296 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" |
308 "b.gt 1b \n" | 297 "b.gt 1b \n" |
309 : "+r"(src_y), // %0 | 298 : "+r"(src_y), // %0 |
310 "+r"(src_u), // %1 | 299 "+r"(src_u), // %1 |
311 "+r"(src_v), // %2 | 300 "+r"(src_v), // %2 |
312 "+r"(dst_rgb24), // %3 | 301 "+r"(dst_rgb24), // %3 |
313 "+r"(width) // %4 | 302 "+r"(width) // %4 |
314 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 303 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
315 [kUVToG]"r"(&yuvconstants->kUVToG), | 304 [kUVToG]"r"(&yuvconstants->kUVToG), |
316 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 305 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
317 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 306 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
318 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 307 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
319 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 308 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
320 ); | 309 ); |
321 } | 310 } |
322 #endif // HAS_I422TORGB24ROW_NEON | |
323 | 311 |
324 #define ARGBTORGB565 \ | 312 #define ARGBTORGB565 \ |
325 "shll v0.8h, v22.8b, #8 \n" /* R */ \ | 313 "shll v0.8h, v22.8b, #8 \n" /* R */ \ |
326 "shll v21.8h, v21.8b, #8 \n" /* G */ \ | 314 "shll v21.8h, v21.8b, #8 \n" /* G */ \ |
327 "shll v20.8h, v20.8b, #8 \n" /* B */ \ | 315 "shll v20.8h, v20.8b, #8 \n" /* B */ \ |
328 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ | 316 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ |
329 "sri v0.8h, v20.8h, #11 \n" /* RGB */ | 317 "sri v0.8h, v20.8h, #11 \n" /* RGB */ |
330 | 318 |
331 #ifdef HAS_I422TORGB565ROW_NEON | |
332 void I422ToRGB565Row_NEON(const uint8* src_y, | 319 void I422ToRGB565Row_NEON(const uint8* src_y, |
333 const uint8* src_u, | 320 const uint8* src_u, |
334 const uint8* src_v, | 321 const uint8* src_v, |
335 uint8* dst_rgb565, | 322 uint8* dst_rgb565, |
336 const struct YuvConstants* yuvconstants, | 323 const struct YuvConstants* yuvconstants, |
337 int width) { | 324 int width) { |
338 asm volatile ( | 325 asm volatile ( |
339 YUVTORGB_SETUP | 326 YUVTORGB_SETUP |
340 "1: \n" | 327 "1: \n" |
341 READYUV422 | 328 READYUV422 |
342 YUVTORGB(v22, v21, v20) | 329 YUVTORGB(v22, v21, v20) |
343 "subs %w4, %w4, #8 \n" | 330 "subs %w4, %w4, #8 \n" |
344 ARGBTORGB565 | 331 ARGBTORGB565 |
345 MEMACCESS(3) | 332 MEMACCESS(3) |
346 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. | 333 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. |
347 "b.gt 1b \n" | 334 "b.gt 1b \n" |
348 : "+r"(src_y), // %0 | 335 : "+r"(src_y), // %0 |
349 "+r"(src_u), // %1 | 336 "+r"(src_u), // %1 |
350 "+r"(src_v), // %2 | 337 "+r"(src_v), // %2 |
351 "+r"(dst_rgb565), // %3 | 338 "+r"(dst_rgb565), // %3 |
352 "+r"(width) // %4 | 339 "+r"(width) // %4 |
353 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 340 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
354 [kUVToG]"r"(&yuvconstants->kUVToG), | 341 [kUVToG]"r"(&yuvconstants->kUVToG), |
355 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 342 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
356 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 343 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
357 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
358 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 345 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
359 ); | 346 ); |
360 } | 347 } |
361 #endif // HAS_I422TORGB565ROW_NEON | |
362 | 348 |
363 #define ARGBTOARGB1555 \ | 349 #define ARGBTOARGB1555 \ |
364 "shll v0.8h, v23.8b, #8 \n" /* A */ \ | 350 "shll v0.8h, v23.8b, #8 \n" /* A */ \ |
365 "shll v22.8h, v22.8b, #8 \n" /* R */ \ | 351 "shll v22.8h, v22.8b, #8 \n" /* R */ \ |
366 "shll v21.8h, v21.8b, #8 \n" /* G */ \ | 352 "shll v21.8h, v21.8b, #8 \n" /* G */ \ |
367 "shll v20.8h, v20.8b, #8 \n" /* B */ \ | 353 "shll v20.8h, v20.8b, #8 \n" /* B */ \ |
368 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ | 354 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ |
369 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ | 355 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ |
370 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ | 356 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ |
371 | 357 |
372 #ifdef HAS_I422TOARGB1555ROW_NEON | |
373 void I422ToARGB1555Row_NEON(const uint8* src_y, | 358 void I422ToARGB1555Row_NEON(const uint8* src_y, |
374 const uint8* src_u, | 359 const uint8* src_u, |
375 const uint8* src_v, | 360 const uint8* src_v, |
376 uint8* dst_argb1555, | 361 uint8* dst_argb1555, |
377 const struct YuvConstants* yuvconstants, | 362 const struct YuvConstants* yuvconstants, |
378 int width) { | 363 int width) { |
379 asm volatile ( | 364 asm volatile ( |
380 YUVTORGB_SETUP | 365 YUVTORGB_SETUP |
381 "movi v23.8b, #255 \n" | 366 "movi v23.8b, #255 \n" |
382 "1: \n" | 367 "1: \n" |
(...skipping 10 matching lines...) Expand all Loading... |
393 "+r"(dst_argb1555), // %3 | 378 "+r"(dst_argb1555), // %3 |
394 "+r"(width) // %4 | 379 "+r"(width) // %4 |
395 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 380 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
396 [kUVToG]"r"(&yuvconstants->kUVToG), | 381 [kUVToG]"r"(&yuvconstants->kUVToG), |
397 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 382 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
398 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 383 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
399 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 384 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
400 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 385 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
401 ); | 386 ); |
402 } | 387 } |
403 #endif // HAS_I422TOARGB1555ROW_NEON | |
404 | 388 |
405 #define ARGBTOARGB4444 \ | 389 #define ARGBTOARGB4444 \ |
406 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ | 390 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ |
407 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ | 391 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ |
408 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ | 392 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ |
409 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ | 393 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ |
410 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ | 394 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ |
411 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ | 395 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ |
412 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ | 396 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ |
413 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ | 397 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ |
414 | 398 |
415 #ifdef HAS_I422TOARGB4444ROW_NEON | |
416 void I422ToARGB4444Row_NEON(const uint8* src_y, | 399 void I422ToARGB4444Row_NEON(const uint8* src_y, |
417 const uint8* src_u, | 400 const uint8* src_u, |
418 const uint8* src_v, | 401 const uint8* src_v, |
419 uint8* dst_argb4444, | 402 uint8* dst_argb4444, |
420 const struct YuvConstants* yuvconstants, | 403 const struct YuvConstants* yuvconstants, |
421 int width) { | 404 int width) { |
422 asm volatile ( | 405 asm volatile ( |
423 YUVTORGB_SETUP | 406 YUVTORGB_SETUP |
424 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 407 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
425 "1: \n" | 408 "1: \n" |
(...skipping 11 matching lines...) Expand all Loading... |
437 "+r"(dst_argb4444), // %3 | 420 "+r"(dst_argb4444), // %3 |
438 "+r"(width) // %4 | 421 "+r"(width) // %4 |
439 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 422 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
440 [kUVToG]"r"(&yuvconstants->kUVToG), | 423 [kUVToG]"r"(&yuvconstants->kUVToG), |
441 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 424 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
442 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 425 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 426 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
444 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 427 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
445 ); | 428 ); |
446 } | 429 } |
447 #endif // HAS_I422TOARGB4444ROW_NEON | |
448 | 430 |
449 #ifdef HAS_I400TOARGBROW_NEON | |
450 void I400ToARGBRow_NEON(const uint8* src_y, | 431 void I400ToARGBRow_NEON(const uint8* src_y, |
451 uint8* dst_argb, | 432 uint8* dst_argb, |
452 int width) { | 433 int width) { |
453 asm volatile ( | 434 asm volatile ( |
454 YUVTORGB_SETUP | 435 YUVTORGB_SETUP |
455 "movi v23.8b, #255 \n" | 436 "movi v23.8b, #255 \n" |
456 "1: \n" | 437 "1: \n" |
457 READYUV400 | 438 READYUV400 |
458 YUVTORGB(v22, v21, v20) | 439 YUVTORGB(v22, v21, v20) |
459 "subs %w2, %w2, #8 \n" | 440 "subs %w2, %w2, #8 \n" |
460 MEMACCESS(1) | 441 MEMACCESS(1) |
461 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 442 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
462 "b.gt 1b \n" | 443 "b.gt 1b \n" |
463 : "+r"(src_y), // %0 | 444 : "+r"(src_y), // %0 |
464 "+r"(dst_argb), // %1 | 445 "+r"(dst_argb), // %1 |
465 "+r"(width) // %2 | 446 "+r"(width) // %2 |
466 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), | 447 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), |
467 [kUVToG]"r"(&kYuvI601Constants.kUVToG), | 448 [kUVToG]"r"(&kYuvI601Constants.kUVToG), |
468 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), | 449 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), |
469 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) | 450 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) |
470 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 451 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
471 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 452 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
472 ); | 453 ); |
473 } | 454 } |
474 #endif // HAS_I400TOARGBROW_NEON | |
475 | 455 |
476 #ifdef HAS_J400TOARGBROW_NEON | |
477 void J400ToARGBRow_NEON(const uint8* src_y, | 456 void J400ToARGBRow_NEON(const uint8* src_y, |
478 uint8* dst_argb, | 457 uint8* dst_argb, |
479 int width) { | 458 int width) { |
480 asm volatile ( | 459 asm volatile ( |
481 "movi v23.8b, #255 \n" | 460 "movi v23.8b, #255 \n" |
482 "1: \n" | 461 "1: \n" |
483 MEMACCESS(0) | 462 MEMACCESS(0) |
484 "ld1 {v20.8b}, [%0], #8 \n" | 463 "ld1 {v20.8b}, [%0], #8 \n" |
485 "orr v21.8b, v20.8b, v20.8b \n" | 464 "orr v21.8b, v20.8b, v20.8b \n" |
486 "orr v22.8b, v20.8b, v20.8b \n" | 465 "orr v22.8b, v20.8b, v20.8b \n" |
487 "subs %w2, %w2, #8 \n" | 466 "subs %w2, %w2, #8 \n" |
488 MEMACCESS(1) | 467 MEMACCESS(1) |
489 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 468 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
490 "b.gt 1b \n" | 469 "b.gt 1b \n" |
491 : "+r"(src_y), // %0 | 470 : "+r"(src_y), // %0 |
492 "+r"(dst_argb), // %1 | 471 "+r"(dst_argb), // %1 |
493 "+r"(width) // %2 | 472 "+r"(width) // %2 |
494 : | 473 : |
495 : "cc", "memory", "v20", "v21", "v22", "v23" | 474 : "cc", "memory", "v20", "v21", "v22", "v23" |
496 ); | 475 ); |
497 } | 476 } |
498 #endif // HAS_J400TOARGBROW_NEON | |
499 | 477 |
500 #ifdef HAS_NV12TOARGBROW_NEON | |
501 void NV12ToARGBRow_NEON(const uint8* src_y, | 478 void NV12ToARGBRow_NEON(const uint8* src_y, |
502 const uint8* src_uv, | 479 const uint8* src_uv, |
503 uint8* dst_argb, | 480 uint8* dst_argb, |
504 const struct YuvConstants* yuvconstants, | 481 const struct YuvConstants* yuvconstants, |
505 int width) { | 482 int width) { |
506 asm volatile ( | 483 asm volatile ( |
507 YUVTORGB_SETUP | 484 YUVTORGB_SETUP |
508 "movi v23.8b, #255 \n" | 485 "movi v23.8b, #255 \n" |
509 "1: \n" | 486 "1: \n" |
510 READNV12 | 487 READNV12 |
511 YUVTORGB(v22, v21, v20) | 488 YUVTORGB(v22, v21, v20) |
512 "subs %w3, %w3, #8 \n" | 489 "subs %w3, %w3, #8 \n" |
513 MEMACCESS(2) | 490 MEMACCESS(2) |
514 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" | 491 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" |
515 "b.gt 1b \n" | 492 "b.gt 1b \n" |
516 : "+r"(src_y), // %0 | 493 : "+r"(src_y), // %0 |
517 "+r"(src_uv), // %1 | 494 "+r"(src_uv), // %1 |
518 "+r"(dst_argb), // %2 | 495 "+r"(dst_argb), // %2 |
519 "+r"(width) // %3 | 496 "+r"(width) // %3 |
520 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 497 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
521 [kUVToG]"r"(&yuvconstants->kUVToG), | 498 [kUVToG]"r"(&yuvconstants->kUVToG), |
522 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 499 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
523 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 500 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
524 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 501 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
525 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 502 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
526 ); | 503 ); |
527 } | 504 } |
528 #endif // HAS_NV12TOARGBROW_NEON | |
529 | 505 |
530 #ifdef HAS_NV12TOARGBROW_NEON | |
531 void NV21ToARGBRow_NEON(const uint8* src_y, | 506 void NV21ToARGBRow_NEON(const uint8* src_y, |
532 const uint8* src_vu, | 507 const uint8* src_vu, |
533 uint8* dst_argb, | 508 uint8* dst_argb, |
534 const struct YuvConstants* yuvconstants, | 509 const struct YuvConstants* yuvconstants, |
535 int width) { | 510 int width) { |
536 asm volatile ( | 511 asm volatile ( |
537 YUVTORGB_SETUP | 512 YUVTORGB_SETUP |
538 "movi v23.8b, #255 \n" | 513 "movi v23.8b, #255 \n" |
539 "1: \n" | 514 "1: \n" |
540 READNV21 | 515 READNV21 |
541 YUVTORGB(v22, v21, v20) | 516 YUVTORGB(v22, v21, v20) |
542 "subs %w3, %w3, #8 \n" | 517 "subs %w3, %w3, #8 \n" |
543 MEMACCESS(2) | 518 MEMACCESS(2) |
544 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" | 519 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" |
545 "b.gt 1b \n" | 520 "b.gt 1b \n" |
546 : "+r"(src_y), // %0 | 521 : "+r"(src_y), // %0 |
547 "+r"(src_vu), // %1 | 522 "+r"(src_vu), // %1 |
548 "+r"(dst_argb), // %2 | 523 "+r"(dst_argb), // %2 |
549 "+r"(width) // %3 | 524 "+r"(width) // %3 |
550 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 525 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
551 [kUVToG]"r"(&yuvconstants->kUVToG), | 526 [kUVToG]"r"(&yuvconstants->kUVToG), |
552 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 527 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
553 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 528 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
554 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 529 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
555 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 530 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
556 ); | 531 ); |
557 } | 532 } |
558 #endif // HAS_NV12TOARGBROW_NEON | |
559 | 533 |
560 #ifdef HAS_NV12TORGB565ROW_NEON | |
561 void NV12ToRGB565Row_NEON(const uint8* src_y, | 534 void NV12ToRGB565Row_NEON(const uint8* src_y, |
562 const uint8* src_uv, | 535 const uint8* src_uv, |
563 uint8* dst_rgb565, | 536 uint8* dst_rgb565, |
564 const struct YuvConstants* yuvconstants, | 537 const struct YuvConstants* yuvconstants, |
565 int width) { | 538 int width) { |
566 asm volatile ( | 539 asm volatile ( |
567 YUVTORGB_SETUP | 540 YUVTORGB_SETUP |
568 "1: \n" | 541 "1: \n" |
569 READNV12 | 542 READNV12 |
570 YUVTORGB(v22, v21, v20) | 543 YUVTORGB(v22, v21, v20) |
571 "subs %w3, %w3, #8 \n" | 544 "subs %w3, %w3, #8 \n" |
572 ARGBTORGB565 | 545 ARGBTORGB565 |
573 MEMACCESS(2) | 546 MEMACCESS(2) |
574 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. | 547 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. |
575 "b.gt 1b \n" | 548 "b.gt 1b \n" |
576 : "+r"(src_y), // %0 | 549 : "+r"(src_y), // %0 |
577 "+r"(src_uv), // %1 | 550 "+r"(src_uv), // %1 |
578 "+r"(dst_rgb565), // %2 | 551 "+r"(dst_rgb565), // %2 |
579 "+r"(width) // %3 | 552 "+r"(width) // %3 |
580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 553 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
581 [kUVToG]"r"(&yuvconstants->kUVToG), | 554 [kUVToG]"r"(&yuvconstants->kUVToG), |
582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 555 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
583 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
586 ); | 559 ); |
587 } | 560 } |
588 #endif // HAS_NV12TORGB565ROW_NEON | |
589 | 561 |
590 #ifdef HAS_YUY2TOARGBROW_NEON | |
591 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, | 562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, |
592 uint8* dst_argb, | 563 uint8* dst_argb, |
593 const struct YuvConstants* yuvconstants, | 564 const struct YuvConstants* yuvconstants, |
594 int width) { | 565 int width) { |
595 int64 width64 = (int64)(width); | 566 int64 width64 = (int64)(width); |
596 asm volatile ( | 567 asm volatile ( |
597 YUVTORGB_SETUP | 568 YUVTORGB_SETUP |
598 "movi v23.8b, #255 \n" | 569 "movi v23.8b, #255 \n" |
599 "1: \n" | 570 "1: \n" |
600 READYUY2 | 571 READYUY2 |
601 YUVTORGB(v22, v21, v20) | 572 YUVTORGB(v22, v21, v20) |
602 "subs %w2, %w2, #8 \n" | 573 "subs %w2, %w2, #8 \n" |
603 MEMACCESS(1) | 574 MEMACCESS(1) |
604 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
605 "b.gt 1b \n" | 576 "b.gt 1b \n" |
606 : "+r"(src_yuy2), // %0 | 577 : "+r"(src_yuy2), // %0 |
607 "+r"(dst_argb), // %1 | 578 "+r"(dst_argb), // %1 |
608 "+r"(width64) // %2 | 579 "+r"(width64) // %2 |
609 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
610 [kUVToG]"r"(&yuvconstants->kUVToG), | 581 [kUVToG]"r"(&yuvconstants->kUVToG), |
611 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
612 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 583 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
614 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
615 ); | 586 ); |
616 } | 587 } |
617 #endif // HAS_YUY2TOARGBROW_NEON | |
618 | 588 |
619 #ifdef HAS_UYVYTOARGBROW_NEON | |
620 void UYVYToARGBRow_NEON(const uint8* src_uyvy, | 589 void UYVYToARGBRow_NEON(const uint8* src_uyvy, |
621 uint8* dst_argb, | 590 uint8* dst_argb, |
622 const struct YuvConstants* yuvconstants, | 591 const struct YuvConstants* yuvconstants, |
623 int width) { | 592 int width) { |
624 int64 width64 = (int64)(width); | 593 int64 width64 = (int64)(width); |
625 asm volatile ( | 594 asm volatile ( |
626 YUVTORGB_SETUP | 595 YUVTORGB_SETUP |
627 "movi v23.8b, #255 \n" | 596 "movi v23.8b, #255 \n" |
628 "1: \n" | 597 "1: \n" |
629 READUYVY | 598 READUYVY |
630 YUVTORGB(v22, v21, v20) | 599 YUVTORGB(v22, v21, v20) |
631 "subs %w2, %w2, #8 \n" | 600 "subs %w2, %w2, #8 \n" |
632 MEMACCESS(1) | 601 MEMACCESS(1) |
633 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" | 602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" |
634 "b.gt 1b \n" | 603 "b.gt 1b \n" |
635 : "+r"(src_uyvy), // %0 | 604 : "+r"(src_uyvy), // %0 |
636 "+r"(dst_argb), // %1 | 605 "+r"(dst_argb), // %1 |
637 "+r"(width64) // %2 | 606 "+r"(width64) // %2 |
638 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 607 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
639 [kUVToG]"r"(&yuvconstants->kUVToG), | 608 [kUVToG]"r"(&yuvconstants->kUVToG), |
640 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
641 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 610 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
642 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
643 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
644 ); | 613 ); |
645 } | 614 } |
646 #endif // HAS_UYVYTOARGBROW_NEON | |
647 | 615 |
648 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. | 616 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. |
649 #ifdef HAS_SPLITUVROW_NEON | |
650 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 617 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
651 int width) { | 618 int width) { |
652 asm volatile ( | 619 asm volatile ( |
653 "1: \n" | 620 "1: \n" |
654 MEMACCESS(0) | 621 MEMACCESS(0) |
655 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV | 622 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV |
656 "subs %w3, %w3, #16 \n" // 16 processed per loop | 623 "subs %w3, %w3, #16 \n" // 16 processed per loop |
657 MEMACCESS(1) | 624 MEMACCESS(1) |
658 "st1 {v0.16b}, [%1], #16 \n" // store U | 625 "st1 {v0.16b}, [%1], #16 \n" // store U |
659 MEMACCESS(2) | 626 MEMACCESS(2) |
660 "st1 {v1.16b}, [%2], #16 \n" // store V | 627 "st1 {v1.16b}, [%2], #16 \n" // store V |
661 "b.gt 1b \n" | 628 "b.gt 1b \n" |
662 : "+r"(src_uv), // %0 | 629 : "+r"(src_uv), // %0 |
663 "+r"(dst_u), // %1 | 630 "+r"(dst_u), // %1 |
664 "+r"(dst_v), // %2 | 631 "+r"(dst_v), // %2 |
665 "+r"(width) // %3 // Output registers | 632 "+r"(width) // %3 // Output registers |
666 : // Input registers | 633 : // Input registers |
667 : "cc", "memory", "v0", "v1" // Clobber List | 634 : "cc", "memory", "v0", "v1" // Clobber List |
668 ); | 635 ); |
669 } | 636 } |
670 #endif // HAS_SPLITUVROW_NEON | |
671 | 637 |
672 // Reads 16 U's and V's and writes out 16 pairs of UV. | 638 // Reads 16 U's and V's and writes out 16 pairs of UV. |
673 #ifdef HAS_MERGEUVROW_NEON | |
674 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 639 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
675 int width) { | 640 int width) { |
676 asm volatile ( | 641 asm volatile ( |
677 "1: \n" | 642 "1: \n" |
678 MEMACCESS(0) | 643 MEMACCESS(0) |
679 "ld1 {v0.16b}, [%0], #16 \n" // load U | 644 "ld1 {v0.16b}, [%0], #16 \n" // load U |
680 MEMACCESS(1) | 645 MEMACCESS(1) |
681 "ld1 {v1.16b}, [%1], #16 \n" // load V | 646 "ld1 {v1.16b}, [%1], #16 \n" // load V |
682 "subs %w3, %w3, #16 \n" // 16 processed per loop | 647 "subs %w3, %w3, #16 \n" // 16 processed per loop |
683 MEMACCESS(2) | 648 MEMACCESS(2) |
684 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV | 649 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV |
685 "b.gt 1b \n" | 650 "b.gt 1b \n" |
686 : | 651 : |
687 "+r"(src_u), // %0 | 652 "+r"(src_u), // %0 |
688 "+r"(src_v), // %1 | 653 "+r"(src_v), // %1 |
689 "+r"(dst_uv), // %2 | 654 "+r"(dst_uv), // %2 |
690 "+r"(width) // %3 // Output registers | 655 "+r"(width) // %3 // Output registers |
691 : // Input registers | 656 : // Input registers |
692 : "cc", "memory", "v0", "v1" // Clobber List | 657 : "cc", "memory", "v0", "v1" // Clobber List |
693 ); | 658 ); |
694 } | 659 } |
695 #endif // HAS_MERGEUVROW_NEON | |
696 | 660 |
697 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. | 661 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. |
698 #ifdef HAS_COPYROW_NEON | |
699 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { | 662 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { |
700 asm volatile ( | 663 asm volatile ( |
701 "1: \n" | 664 "1: \n" |
702 MEMACCESS(0) | 665 MEMACCESS(0) |
703 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 | 666 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 |
704 "subs %w2, %w2, #32 \n" // 32 processed per loop | 667 "subs %w2, %w2, #32 \n" // 32 processed per loop |
705 MEMACCESS(1) | 668 MEMACCESS(1) |
706 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 | 669 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 |
707 "b.gt 1b \n" | 670 "b.gt 1b \n" |
708 : "+r"(src), // %0 | 671 : "+r"(src), // %0 |
709 "+r"(dst), // %1 | 672 "+r"(dst), // %1 |
710 "+r"(count) // %2 // Output registers | 673 "+r"(count) // %2 // Output registers |
711 : // Input registers | 674 : // Input registers |
712 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 675 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
713 ); | 676 ); |
714 } | 677 } |
715 #endif // HAS_COPYROW_NEON | |
716 | 678 |
717 // SetRow writes 'count' bytes using an 8 bit value repeated. | 679 // SetRow writes 'count' bytes using an 8 bit value repeated. |
718 void SetRow_NEON(uint8* dst, uint8 v8, int count) { | 680 void SetRow_NEON(uint8* dst, uint8 v8, int count) { |
719 asm volatile ( | 681 asm volatile ( |
720 "dup v0.16b, %w2 \n" // duplicate 16 bytes | 682 "dup v0.16b, %w2 \n" // duplicate 16 bytes |
721 "1: \n" | 683 "1: \n" |
722 "subs %w1, %w1, #16 \n" // 16 bytes per loop | 684 "subs %w1, %w1, #16 \n" // 16 bytes per loop |
723 MEMACCESS(0) | 685 MEMACCESS(0) |
724 "st1 {v0.16b}, [%0], #16 \n" // store | 686 "st1 {v0.16b}, [%0], #16 \n" // store |
725 "b.gt 1b \n" | 687 "b.gt 1b \n" |
(...skipping 12 matching lines...) Expand all Loading... |
738 MEMACCESS(0) | 700 MEMACCESS(0) |
739 "st1 {v0.16b}, [%0], #16 \n" // store | 701 "st1 {v0.16b}, [%0], #16 \n" // store |
740 "b.gt 1b \n" | 702 "b.gt 1b \n" |
741 : "+r"(dst), // %0 | 703 : "+r"(dst), // %0 |
742 "+r"(count) // %1 | 704 "+r"(count) // %1 |
743 : "r"(v32) // %2 | 705 : "r"(v32) // %2 |
744 : "cc", "memory", "v0" | 706 : "cc", "memory", "v0" |
745 ); | 707 ); |
746 } | 708 } |
747 | 709 |
748 #ifdef HAS_MIRRORROW_NEON | |
749 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
750 int64 width64 = (int64) width; | 711 int64 width64 = (int64) width; |
751 asm volatile ( | 712 asm volatile ( |
752 // Start at end of source row. | 713 // Start at end of source row. |
753 "add %0, %0, %2 \n" | 714 "add %0, %0, %2 \n" |
754 "sub %0, %0, #16 \n" | 715 "sub %0, %0, #16 \n" |
755 | 716 |
756 "1: \n" | 717 "1: \n" |
757 MEMACCESS(0) | 718 MEMACCESS(0) |
758 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
759 "subs %2, %2, #16 \n" // 16 pixels per loop. | 720 "subs %2, %2, #16 \n" // 16 pixels per loop. |
760 "rev64 v0.16b, v0.16b \n" | 721 "rev64 v0.16b, v0.16b \n" |
761 MEMACCESS(1) | 722 MEMACCESS(1) |
762 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
763 MEMACCESS(1) | 724 MEMACCESS(1) |
764 "st1 {v0.D}[0], [%1], #8 \n" | 725 "st1 {v0.D}[0], [%1], #8 \n" |
765 "b.gt 1b \n" | 726 "b.gt 1b \n" |
766 : "+r"(src), // %0 | 727 : "+r"(src), // %0 |
767 "+r"(dst), // %1 | 728 "+r"(dst), // %1 |
768 "+r"(width64) // %2 | 729 "+r"(width64) // %2 |
769 : "r"((ptrdiff_t)-16) // %3 | 730 : "r"((ptrdiff_t)-16) // %3 |
770 : "cc", "memory", "v0" | 731 : "cc", "memory", "v0" |
771 ); | 732 ); |
772 } | 733 } |
773 #endif // HAS_MIRRORROW_NEON | |
774 | 734 |
775 #ifdef HAS_MIRRORUVROW_NEON | |
776 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
777 int width) { | 736 int width) { |
778 int64 width64 = (int64) width; | 737 int64 width64 = (int64) width; |
779 asm volatile ( | 738 asm volatile ( |
780 // Start at end of source row. | 739 // Start at end of source row. |
781 "add %0, %0, %3, lsl #1 \n" | 740 "add %0, %0, %3, lsl #1 \n" |
782 "sub %0, %0, #16 \n" | 741 "sub %0, %0, #16 \n" |
783 | 742 |
784 "1: \n" | 743 "1: \n" |
785 MEMACCESS(0) | 744 MEMACCESS(0) |
786 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 | 745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
787 "subs %3, %3, #8 \n" // 8 pixels per loop. | 746 "subs %3, %3, #8 \n" // 8 pixels per loop. |
788 "rev64 v0.8b, v0.8b \n" | 747 "rev64 v0.8b, v0.8b \n" |
789 "rev64 v1.8b, v1.8b \n" | 748 "rev64 v1.8b, v1.8b \n" |
790 MEMACCESS(1) | 749 MEMACCESS(1) |
791 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 | 750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
792 MEMACCESS(2) | 751 MEMACCESS(2) |
793 "st1 {v1.8b}, [%2], #8 \n" | 752 "st1 {v1.8b}, [%2], #8 \n" |
794 "b.gt 1b \n" | 753 "b.gt 1b \n" |
795 : "+r"(src_uv), // %0 | 754 : "+r"(src_uv), // %0 |
796 "+r"(dst_u), // %1 | 755 "+r"(dst_u), // %1 |
797 "+r"(dst_v), // %2 | 756 "+r"(dst_v), // %2 |
798 "+r"(width64) // %3 | 757 "+r"(width64) // %3 |
799 : "r"((ptrdiff_t)-16) // %4 | 758 : "r"((ptrdiff_t)-16) // %4 |
800 : "cc", "memory", "v0", "v1" | 759 : "cc", "memory", "v0", "v1" |
801 ); | 760 ); |
802 } | 761 } |
803 #endif // HAS_MIRRORUVROW_NEON | |
804 | 762 |
805 #ifdef HAS_ARGBMIRRORROW_NEON | |
806 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
807 int64 width64 = (int64) width; | 764 int64 width64 = (int64) width; |
808 asm volatile ( | 765 asm volatile ( |
809 // Start at end of source row. | 766 // Start at end of source row. |
810 "add %0, %0, %2, lsl #2 \n" | 767 "add %0, %0, %2, lsl #2 \n" |
811 "sub %0, %0, #16 \n" | 768 "sub %0, %0, #16 \n" |
812 | 769 |
813 "1: \n" | 770 "1: \n" |
814 MEMACCESS(0) | 771 MEMACCESS(0) |
815 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
816 "subs %2, %2, #4 \n" // 4 pixels per loop. | 773 "subs %2, %2, #4 \n" // 4 pixels per loop. |
817 "rev64 v0.4s, v0.4s \n" | 774 "rev64 v0.4s, v0.4s \n" |
818 MEMACCESS(1) | 775 MEMACCESS(1) |
819 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
820 MEMACCESS(1) | 777 MEMACCESS(1) |
821 "st1 {v0.D}[0], [%1], #8 \n" | 778 "st1 {v0.D}[0], [%1], #8 \n" |
822 "b.gt 1b \n" | 779 "b.gt 1b \n" |
823 : "+r"(src), // %0 | 780 : "+r"(src), // %0 |
824 "+r"(dst), // %1 | 781 "+r"(dst), // %1 |
825 "+r"(width64) // %2 | 782 "+r"(width64) // %2 |
826 : "r"((ptrdiff_t)-16) // %3 | 783 : "r"((ptrdiff_t)-16) // %3 |
827 : "cc", "memory", "v0" | 784 : "cc", "memory", "v0" |
828 ); | 785 ); |
829 } | 786 } |
830 #endif // HAS_ARGBMIRRORROW_NEON | |
831 | 787 |
832 #ifdef HAS_RGB24TOARGBROW_NEON | |
833 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { | 788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { |
834 asm volatile ( | 789 asm volatile ( |
835 "movi v4.8b, #255 \n" // Alpha | 790 "movi v4.8b, #255 \n" // Alpha |
836 "1: \n" | 791 "1: \n" |
837 MEMACCESS(0) | 792 MEMACCESS(0) |
838 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
839 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 794 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
840 MEMACCESS(1) | 795 MEMACCESS(1) |
841 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
842 "b.gt 1b \n" | 797 "b.gt 1b \n" |
843 : "+r"(src_rgb24), // %0 | 798 : "+r"(src_rgb24), // %0 |
844 "+r"(dst_argb), // %1 | 799 "+r"(dst_argb), // %1 |
845 "+r"(width) // %2 | 800 "+r"(width) // %2 |
846 : | 801 : |
847 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
848 ); | 803 ); |
849 } | 804 } |
850 #endif // HAS_RGB24TOARGBROW_NEON | |
851 | 805 |
852 #ifdef HAS_RAWTOARGBROW_NEON | |
853 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { | 806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { |
854 asm volatile ( | 807 asm volatile ( |
855 "movi v5.8b, #255 \n" // Alpha | 808 "movi v5.8b, #255 \n" // Alpha |
856 "1: \n" | 809 "1: \n" |
857 MEMACCESS(0) | 810 MEMACCESS(0) |
858 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
859 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 812 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
860 "orr v3.8b, v1.8b, v1.8b \n" // move g | 813 "orr v3.8b, v1.8b, v1.8b \n" // move g |
861 "orr v4.8b, v0.8b, v0.8b \n" // move r | 814 "orr v4.8b, v0.8b, v0.8b \n" // move r |
862 MEMACCESS(1) | 815 MEMACCESS(1) |
863 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
864 "b.gt 1b \n" | 817 "b.gt 1b \n" |
865 : "+r"(src_raw), // %0 | 818 : "+r"(src_raw), // %0 |
866 "+r"(dst_argb), // %1 | 819 "+r"(dst_argb), // %1 |
867 "+r"(width) // %2 | 820 "+r"(width) // %2 |
868 : | 821 : |
869 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
870 ); | 823 ); |
871 } | 824 } |
872 #endif // HAS_RAWTOARGBROW_NEON | |
873 | 825 |
874 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { | 826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { |
875 asm volatile ( | 827 asm volatile ( |
876 "1: \n" | 828 "1: \n" |
877 MEMACCESS(0) | 829 MEMACCESS(0) |
878 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
879 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 831 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
880 "orr v3.8b, v1.8b, v1.8b \n" // move g | 832 "orr v3.8b, v1.8b, v1.8b \n" // move g |
881 "orr v4.8b, v0.8b, v0.8b \n" // move r | 833 "orr v4.8b, v0.8b, v0.8b \n" // move r |
882 MEMACCESS(1) | 834 MEMACCESS(1) |
(...skipping 13 matching lines...) Expand all Loading... |
896 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ | 848 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ |
897 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ | 849 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ |
898 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ | 850 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ |
899 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ | 851 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ |
900 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ | 852 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ |
901 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ | 853 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ |
902 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ | 854 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ |
903 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ | 855 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ |
904 "dup v2.2D, v0.D[1] \n" /* R */ | 856 "dup v2.2D, v0.D[1] \n" /* R */ |
905 | 857 |
906 #ifdef HAS_RGB565TOARGBROW_NEON | |
907 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { | 858 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { |
908 asm volatile ( | 859 asm volatile ( |
909 "movi v3.8b, #255 \n" // Alpha | 860 "movi v3.8b, #255 \n" // Alpha |
910 "1: \n" | 861 "1: \n" |
911 MEMACCESS(0) | 862 MEMACCESS(0) |
912 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 863 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
913 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 864 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
914 RGB565TOARGB | 865 RGB565TOARGB |
915 MEMACCESS(1) | 866 MEMACCESS(1) |
916 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 867 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
917 "b.gt 1b \n" | 868 "b.gt 1b \n" |
918 : "+r"(src_rgb565), // %0 | 869 : "+r"(src_rgb565), // %0 |
919 "+r"(dst_argb), // %1 | 870 "+r"(dst_argb), // %1 |
920 "+r"(width) // %2 | 871 "+r"(width) // %2 |
921 : | 872 : |
922 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List | 873 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List |
923 ); | 874 ); |
924 } | 875 } |
925 #endif // HAS_RGB565TOARGBROW_NEON | |
926 | 876 |
927 #define ARGB1555TOARGB \ | 877 #define ARGB1555TOARGB \ |
928 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ | 878 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ |
929 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ | 879 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ |
930 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ | 880 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ |
931 \ | 881 \ |
932 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ | 882 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ |
933 "xtn2 v3.16b, v2.8h \n" \ | 883 "xtn2 v3.16b, v2.8h \n" \ |
934 \ | 884 \ |
935 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ | 885 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ |
(...skipping 18 matching lines...) Expand all Loading... |
954 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ | 904 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ |
955 \ | 905 \ |
956 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ | 906 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ |
957 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ | 907 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ |
958 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ | 908 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ |
959 \ | 909 \ |
960 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ | 910 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ |
961 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ | 911 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ |
962 "dup v1.2D, v0.D[1] \n" /* G */ \ | 912 "dup v1.2D, v0.D[1] \n" /* G */ \ |
963 | 913 |
964 #ifdef HAS_ARGB1555TOARGBROW_NEON | |
965 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, | 914 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, |
966 int width) { | 915 int width) { |
967 asm volatile ( | 916 asm volatile ( |
968 "movi v3.8b, #255 \n" // Alpha | 917 "movi v3.8b, #255 \n" // Alpha |
969 "1: \n" | 918 "1: \n" |
970 MEMACCESS(0) | 919 MEMACCESS(0) |
971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 920 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
972 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 921 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
973 ARGB1555TOARGB | 922 ARGB1555TOARGB |
974 MEMACCESS(1) | 923 MEMACCESS(1) |
975 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 924 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
976 "b.gt 1b \n" | 925 "b.gt 1b \n" |
977 : "+r"(src_argb1555), // %0 | 926 : "+r"(src_argb1555), // %0 |
978 "+r"(dst_argb), // %1 | 927 "+r"(dst_argb), // %1 |
979 "+r"(width) // %2 | 928 "+r"(width) // %2 |
980 : | 929 : |
981 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 930 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
982 ); | 931 ); |
983 } | 932 } |
984 #endif // HAS_ARGB1555TOARGBROW_NEON | |
985 | 933 |
986 #define ARGB4444TOARGB \ | 934 #define ARGB4444TOARGB \ |
987 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ | 935 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ |
988 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ | 936 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ |
989 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ | 937 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ |
990 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ | 938 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ |
991 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ | 939 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ |
992 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ | 940 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ |
993 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ | 941 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ |
994 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ | 942 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ |
995 "dup v0.2D, v2.D[1] \n" \ | 943 "dup v0.2D, v2.D[1] \n" \ |
996 "dup v1.2D, v3.D[1] \n" | 944 "dup v1.2D, v3.D[1] \n" |
997 | 945 |
998 #ifdef HAS_ARGB4444TOARGBROW_NEON | |
999 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, | 946 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, |
1000 int width) { | 947 int width) { |
1001 asm volatile ( | 948 asm volatile ( |
1002 "1: \n" | 949 "1: \n" |
1003 MEMACCESS(0) | 950 MEMACCESS(0) |
1004 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 951 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
1005 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 952 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1006 ARGB4444TOARGB | 953 ARGB4444TOARGB |
1007 MEMACCESS(1) | 954 MEMACCESS(1) |
1008 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 955 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
1009 "b.gt 1b \n" | 956 "b.gt 1b \n" |
1010 : "+r"(src_argb4444), // %0 | 957 : "+r"(src_argb4444), // %0 |
1011 "+r"(dst_argb), // %1 | 958 "+r"(dst_argb), // %1 |
1012 "+r"(width) // %2 | 959 "+r"(width) // %2 |
1013 : | 960 : |
1014 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List | 961 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List |
1015 ); | 962 ); |
1016 } | 963 } |
1017 #endif // HAS_ARGB4444TOARGBROW_NEON | |
1018 | 964 |
1019 #ifdef HAS_ARGBTORGB24ROW_NEON | |
1020 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { | 965 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { |
1021 asm volatile ( | 966 asm volatile ( |
1022 "1: \n" | 967 "1: \n" |
1023 MEMACCESS(0) | 968 MEMACCESS(0) |
1024 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels | 969 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels |
1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 970 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1026 MEMACCESS(1) | 971 MEMACCESS(1) |
1027 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. | 972 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
1028 "b.gt 1b \n" | 973 "b.gt 1b \n" |
1029 : "+r"(src_argb), // %0 | 974 : "+r"(src_argb), // %0 |
1030 "+r"(dst_rgb24), // %1 | 975 "+r"(dst_rgb24), // %1 |
1031 "+r"(width) // %2 | 976 "+r"(width) // %2 |
1032 : | 977 : |
1033 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 978 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
1034 ); | 979 ); |
1035 } | 980 } |
1036 #endif // HAS_ARGBTORGB24ROW_NEON | |
1037 | 981 |
1038 #ifdef HAS_ARGBTORAWROW_NEON | |
1039 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { | 982 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { |
1040 asm volatile ( | 983 asm volatile ( |
1041 "1: \n" | 984 "1: \n" |
1042 MEMACCESS(0) | 985 MEMACCESS(0) |
1043 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a | 986 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a |
1044 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 987 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1045 "orr v4.8b, v2.8b, v2.8b \n" // mov g | 988 "orr v4.8b, v2.8b, v2.8b \n" // mov g |
1046 "orr v5.8b, v1.8b, v1.8b \n" // mov b | 989 "orr v5.8b, v1.8b, v1.8b \n" // mov b |
1047 MEMACCESS(1) | 990 MEMACCESS(1) |
1048 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b | 991 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b |
1049 "b.gt 1b \n" | 992 "b.gt 1b \n" |
1050 : "+r"(src_argb), // %0 | 993 : "+r"(src_argb), // %0 |
1051 "+r"(dst_raw), // %1 | 994 "+r"(dst_raw), // %1 |
1052 "+r"(width) // %2 | 995 "+r"(width) // %2 |
1053 : | 996 : |
1054 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List | 997 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
1055 ); | 998 ); |
1056 } | 999 } |
1057 #endif // HAS_ARGBTORAWROW_NEON | |
1058 | 1000 |
1059 #ifdef HAS_YUY2TOYROW_NEON | |
1060 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { | 1001 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { |
1061 asm volatile ( | 1002 asm volatile ( |
1062 "1: \n" | 1003 "1: \n" |
1063 MEMACCESS(0) | 1004 MEMACCESS(0) |
1064 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. | 1005 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
1065 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1006 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
1066 MEMACCESS(1) | 1007 MEMACCESS(1) |
1067 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1008 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1068 "b.gt 1b \n" | 1009 "b.gt 1b \n" |
1069 : "+r"(src_yuy2), // %0 | 1010 : "+r"(src_yuy2), // %0 |
1070 "+r"(dst_y), // %1 | 1011 "+r"(dst_y), // %1 |
1071 "+r"(width) // %2 | 1012 "+r"(width) // %2 |
1072 : | 1013 : |
1073 : "cc", "memory", "v0", "v1" // Clobber List | 1014 : "cc", "memory", "v0", "v1" // Clobber List |
1074 ); | 1015 ); |
1075 } | 1016 } |
1076 #endif // HAS_YUY2TOYROW_NEON | |
1077 | 1017 |
1078 #ifdef HAS_UYVYTOYROW_NEON | |
1079 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { | 1018 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { |
1080 asm volatile ( | 1019 asm volatile ( |
1081 "1: \n" | 1020 "1: \n" |
1082 MEMACCESS(0) | 1021 MEMACCESS(0) |
1083 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. | 1022 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
1084 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1023 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
1085 MEMACCESS(1) | 1024 MEMACCESS(1) |
1086 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1025 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1087 "b.gt 1b \n" | 1026 "b.gt 1b \n" |
1088 : "+r"(src_uyvy), // %0 | 1027 : "+r"(src_uyvy), // %0 |
1089 "+r"(dst_y), // %1 | 1028 "+r"(dst_y), // %1 |
1090 "+r"(width) // %2 | 1029 "+r"(width) // %2 |
1091 : | 1030 : |
1092 : "cc", "memory", "v0", "v1" // Clobber List | 1031 : "cc", "memory", "v0", "v1" // Clobber List |
1093 ); | 1032 ); |
1094 } | 1033 } |
1095 #endif // HAS_UYVYTOYROW_NEON | |
1096 | 1034 |
1097 #ifdef HAS_YUY2TOUV422ROW_NEON | |
1098 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1035 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
1099 int width) { | 1036 int width) { |
1100 asm volatile ( | 1037 asm volatile ( |
1101 "1: \n" | 1038 "1: \n" |
1102 MEMACCESS(0) | 1039 MEMACCESS(0) |
1103 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels | 1040 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels |
1104 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1041 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
1105 MEMACCESS(1) | 1042 MEMACCESS(1) |
1106 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. | 1043 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
1107 MEMACCESS(2) | 1044 MEMACCESS(2) |
1108 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. | 1045 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
1109 "b.gt 1b \n" | 1046 "b.gt 1b \n" |
1110 : "+r"(src_yuy2), // %0 | 1047 : "+r"(src_yuy2), // %0 |
1111 "+r"(dst_u), // %1 | 1048 "+r"(dst_u), // %1 |
1112 "+r"(dst_v), // %2 | 1049 "+r"(dst_v), // %2 |
1113 "+r"(width) // %3 | 1050 "+r"(width) // %3 |
1114 : | 1051 : |
1115 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1052 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1116 ); | 1053 ); |
1117 } | 1054 } |
1118 #endif // HAS_YUY2TOUV422ROW_NEON | |
1119 | 1055 |
1120 #ifdef HAS_UYVYTOUV422ROW_NEON | |
1121 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1056 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
1122 int width) { | 1057 int width) { |
1123 asm volatile ( | 1058 asm volatile ( |
1124 "1: \n" | 1059 "1: \n" |
1125 MEMACCESS(0) | 1060 MEMACCESS(0) |
1126 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels | 1061 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels |
1127 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1062 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
1128 MEMACCESS(1) | 1063 MEMACCESS(1) |
1129 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. | 1064 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
1130 MEMACCESS(2) | 1065 MEMACCESS(2) |
1131 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. | 1066 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
1132 "b.gt 1b \n" | 1067 "b.gt 1b \n" |
1133 : "+r"(src_uyvy), // %0 | 1068 : "+r"(src_uyvy), // %0 |
1134 "+r"(dst_u), // %1 | 1069 "+r"(dst_u), // %1 |
1135 "+r"(dst_v), // %2 | 1070 "+r"(dst_v), // %2 |
1136 "+r"(width) // %3 | 1071 "+r"(width) // %3 |
1137 : | 1072 : |
1138 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1073 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1139 ); | 1074 ); |
1140 } | 1075 } |
1141 #endif // HAS_UYVYTOUV422ROW_NEON | |
1142 | 1076 |
1143 #ifdef HAS_YUY2TOUVROW_NEON | |
1144 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1077 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
1145 uint8* dst_u, uint8* dst_v, int width) { | 1078 uint8* dst_u, uint8* dst_v, int width) { |
1146 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; | 1079 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; |
1147 asm volatile ( | 1080 asm volatile ( |
1148 "1: \n" | 1081 "1: \n" |
1149 MEMACCESS(0) | 1082 MEMACCESS(0) |
1150 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1083 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
1151 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1084 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
1152 MEMACCESS(1) | 1085 MEMACCESS(1) |
1153 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1086 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
1154 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U | 1087 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
1155 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V | 1088 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
1156 MEMACCESS(2) | 1089 MEMACCESS(2) |
1157 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. | 1090 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
1158 MEMACCESS(3) | 1091 MEMACCESS(3) |
1159 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. | 1092 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
1160 "b.gt 1b \n" | 1093 "b.gt 1b \n" |
1161 : "+r"(src_yuy2), // %0 | 1094 : "+r"(src_yuy2), // %0 |
1162 "+r"(src_yuy2b), // %1 | 1095 "+r"(src_yuy2b), // %1 |
1163 "+r"(dst_u), // %2 | 1096 "+r"(dst_u), // %2 |
1164 "+r"(dst_v), // %3 | 1097 "+r"(dst_v), // %3 |
1165 "+r"(width) // %4 | 1098 "+r"(width) // %4 |
1166 : | 1099 : |
1167 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1100 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1168 "v5", "v6", "v7" // Clobber List | 1101 "v5", "v6", "v7" // Clobber List |
1169 ); | 1102 ); |
1170 } | 1103 } |
1171 #endif // HAS_YUY2TOUVROW_NEON | |
1172 | 1104 |
1173 #ifdef HAS_UYVYTOUVROW_NEON | |
1174 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1105 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
1175 uint8* dst_u, uint8* dst_v, int width) { | 1106 uint8* dst_u, uint8* dst_v, int width) { |
1176 const uint8* src_uyvyb = src_uyvy + stride_uyvy; | 1107 const uint8* src_uyvyb = src_uyvy + stride_uyvy; |
1177 asm volatile ( | 1108 asm volatile ( |
1178 "1: \n" | 1109 "1: \n" |
1179 MEMACCESS(0) | 1110 MEMACCESS(0) |
1180 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1111 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
1181 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1112 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
1182 MEMACCESS(1) | 1113 MEMACCESS(1) |
1183 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1114 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
1184 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U | 1115 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
1185 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V | 1116 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
1186 MEMACCESS(2) | 1117 MEMACCESS(2) |
1187 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. | 1118 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
1188 MEMACCESS(3) | 1119 MEMACCESS(3) |
1189 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. | 1120 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
1190 "b.gt 1b \n" | 1121 "b.gt 1b \n" |
1191 : "+r"(src_uyvy), // %0 | 1122 : "+r"(src_uyvy), // %0 |
1192 "+r"(src_uyvyb), // %1 | 1123 "+r"(src_uyvyb), // %1 |
1193 "+r"(dst_u), // %2 | 1124 "+r"(dst_u), // %2 |
1194 "+r"(dst_v), // %3 | 1125 "+r"(dst_v), // %3 |
1195 "+r"(width) // %4 | 1126 "+r"(width) // %4 |
1196 : | 1127 : |
1197 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1128 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1198 "v5", "v6", "v7" // Clobber List | 1129 "v5", "v6", "v7" // Clobber List |
1199 ); | 1130 ); |
1200 } | 1131 } |
1201 #endif // HAS_UYVYTOUVROW_NEON | |
1202 | 1132 |
1203 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1133 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
1204 #ifdef HAS_ARGBSHUFFLEROW_NEON | |
1205 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1134 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
1206 const uint8* shuffler, int width) { | 1135 const uint8* shuffler, int width) { |
1207 asm volatile ( | 1136 asm volatile ( |
1208 MEMACCESS(3) | 1137 MEMACCESS(3) |
1209 "ld1 {v2.16b}, [%3] \n" // shuffler | 1138 "ld1 {v2.16b}, [%3] \n" // shuffler |
1210 "1: \n" | 1139 "1: \n" |
1211 MEMACCESS(0) | 1140 MEMACCESS(0) |
1212 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. | 1141 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
1213 "subs %w2, %w2, #4 \n" // 4 processed per loop | 1142 "subs %w2, %w2, #4 \n" // 4 processed per loop |
1214 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels | 1143 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
1215 MEMACCESS(1) | 1144 MEMACCESS(1) |
1216 "st1 {v1.16b}, [%1], #16 \n" // store 4. | 1145 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
1217 "b.gt 1b \n" | 1146 "b.gt 1b \n" |
1218 : "+r"(src_argb), // %0 | 1147 : "+r"(src_argb), // %0 |
1219 "+r"(dst_argb), // %1 | 1148 "+r"(dst_argb), // %1 |
1220 "+r"(width) // %2 | 1149 "+r"(width) // %2 |
1221 : "r"(shuffler) // %3 | 1150 : "r"(shuffler) // %3 |
1222 : "cc", "memory", "v0", "v1", "v2" // Clobber List | 1151 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
1223 ); | 1152 ); |
1224 } | 1153 } |
1225 #endif // HAS_ARGBSHUFFLEROW_NEON | |
1226 | 1154 |
1227 #ifdef HAS_I422TOYUY2ROW_NEON | |
1228 void I422ToYUY2Row_NEON(const uint8* src_y, | 1155 void I422ToYUY2Row_NEON(const uint8* src_y, |
1229 const uint8* src_u, | 1156 const uint8* src_u, |
1230 const uint8* src_v, | 1157 const uint8* src_v, |
1231 uint8* dst_yuy2, int width) { | 1158 uint8* dst_yuy2, int width) { |
1232 asm volatile ( | 1159 asm volatile ( |
1233 "1: \n" | 1160 "1: \n" |
1234 MEMACCESS(0) | 1161 MEMACCESS(0) |
1235 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys | 1162 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys |
1236 "orr v2.8b, v1.8b, v1.8b \n" | 1163 "orr v2.8b, v1.8b, v1.8b \n" |
1237 MEMACCESS(1) | 1164 MEMACCESS(1) |
1238 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us | 1165 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us |
1239 MEMACCESS(2) | 1166 MEMACCESS(2) |
1240 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs | 1167 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs |
1241 "subs %w4, %w4, #16 \n" // 16 pixels | 1168 "subs %w4, %w4, #16 \n" // 16 pixels |
1242 MEMACCESS(3) | 1169 MEMACCESS(3) |
1243 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. | 1170 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. |
1244 "b.gt 1b \n" | 1171 "b.gt 1b \n" |
1245 : "+r"(src_y), // %0 | 1172 : "+r"(src_y), // %0 |
1246 "+r"(src_u), // %1 | 1173 "+r"(src_u), // %1 |
1247 "+r"(src_v), // %2 | 1174 "+r"(src_v), // %2 |
1248 "+r"(dst_yuy2), // %3 | 1175 "+r"(dst_yuy2), // %3 |
1249 "+r"(width) // %4 | 1176 "+r"(width) // %4 |
1250 : | 1177 : |
1251 : "cc", "memory", "v0", "v1", "v2", "v3" | 1178 : "cc", "memory", "v0", "v1", "v2", "v3" |
1252 ); | 1179 ); |
1253 } | 1180 } |
1254 #endif // HAS_I422TOYUY2ROW_NEON | |
1255 | 1181 |
1256 #ifdef HAS_I422TOUYVYROW_NEON | |
1257 void I422ToUYVYRow_NEON(const uint8* src_y, | 1182 void I422ToUYVYRow_NEON(const uint8* src_y, |
1258 const uint8* src_u, | 1183 const uint8* src_u, |
1259 const uint8* src_v, | 1184 const uint8* src_v, |
1260 uint8* dst_uyvy, int width) { | 1185 uint8* dst_uyvy, int width) { |
1261 asm volatile ( | 1186 asm volatile ( |
1262 "1: \n" | 1187 "1: \n" |
1263 MEMACCESS(0) | 1188 MEMACCESS(0) |
1264 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys | 1189 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys |
1265 "orr v3.8b, v2.8b, v2.8b \n" | 1190 "orr v3.8b, v2.8b, v2.8b \n" |
1266 MEMACCESS(1) | 1191 MEMACCESS(1) |
1267 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us | 1192 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us |
1268 MEMACCESS(2) | 1193 MEMACCESS(2) |
1269 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs | 1194 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs |
1270 "subs %w4, %w4, #16 \n" // 16 pixels | 1195 "subs %w4, %w4, #16 \n" // 16 pixels |
1271 MEMACCESS(3) | 1196 MEMACCESS(3) |
1272 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. | 1197 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. |
1273 "b.gt 1b \n" | 1198 "b.gt 1b \n" |
1274 : "+r"(src_y), // %0 | 1199 : "+r"(src_y), // %0 |
1275 "+r"(src_u), // %1 | 1200 "+r"(src_u), // %1 |
1276 "+r"(src_v), // %2 | 1201 "+r"(src_v), // %2 |
1277 "+r"(dst_uyvy), // %3 | 1202 "+r"(dst_uyvy), // %3 |
1278 "+r"(width) // %4 | 1203 "+r"(width) // %4 |
1279 : | 1204 : |
1280 : "cc", "memory", "v0", "v1", "v2", "v3" | 1205 : "cc", "memory", "v0", "v1", "v2", "v3" |
1281 ); | 1206 ); |
1282 } | 1207 } |
1283 #endif // HAS_I422TOUYVYROW_NEON | |
1284 | 1208 |
1285 #ifdef HAS_ARGBTORGB565ROW_NEON | |
1286 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { | 1209 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { |
1287 asm volatile ( | 1210 asm volatile ( |
1288 "1: \n" | 1211 "1: \n" |
1289 MEMACCESS(0) | 1212 MEMACCESS(0) |
1290 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1213 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1291 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1214 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1292 ARGBTORGB565 | 1215 ARGBTORGB565 |
1293 MEMACCESS(1) | 1216 MEMACCESS(1) |
1294 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. | 1217 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. |
1295 "b.gt 1b \n" | 1218 "b.gt 1b \n" |
1296 : "+r"(src_argb), // %0 | 1219 : "+r"(src_argb), // %0 |
1297 "+r"(dst_rgb565), // %1 | 1220 "+r"(dst_rgb565), // %1 |
1298 "+r"(width) // %2 | 1221 "+r"(width) // %2 |
1299 : | 1222 : |
1300 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1223 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
1301 ); | 1224 ); |
1302 } | 1225 } |
1303 #endif // HAS_ARGBTORGB565ROW_NEON | |
1304 | 1226 |
1305 #ifdef HAS_ARGBTORGB565DITHERROW_NEON | |
1306 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, | 1227 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, |
1307 const uint32 dither4, int width) { | 1228 const uint32 dither4, int width) { |
1308 asm volatile ( | 1229 asm volatile ( |
1309 "dup v1.4s, %w2 \n" // dither4 | 1230 "dup v1.4s, %w2 \n" // dither4 |
1310 "1: \n" | 1231 "1: \n" |
1311 MEMACCESS(1) | 1232 MEMACCESS(1) |
1312 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels | 1233 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels |
1313 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 1234 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
1314 "uqadd v20.8b, v20.8b, v1.8b \n" | 1235 "uqadd v20.8b, v20.8b, v1.8b \n" |
1315 "uqadd v21.8b, v21.8b, v1.8b \n" | 1236 "uqadd v21.8b, v21.8b, v1.8b \n" |
1316 "uqadd v22.8b, v22.8b, v1.8b \n" | 1237 "uqadd v22.8b, v22.8b, v1.8b \n" |
1317 ARGBTORGB565 | 1238 ARGBTORGB565 |
1318 MEMACCESS(0) | 1239 MEMACCESS(0) |
1319 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. | 1240 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. |
1320 "b.gt 1b \n" | 1241 "b.gt 1b \n" |
1321 : "+r"(dst_rgb) // %0 | 1242 : "+r"(dst_rgb) // %0 |
1322 : "r"(src_argb), // %1 | 1243 : "r"(src_argb), // %1 |
1323 "r"(dither4), // %2 | 1244 "r"(dither4), // %2 |
1324 "r"(width) // %3 | 1245 "r"(width) // %3 |
1325 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" | 1246 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" |
1326 ); | 1247 ); |
1327 } | 1248 } |
1328 #endif // HAS_ARGBTORGB565ROW_NEON | |
1329 | 1249 |
1330 #ifdef HAS_ARGBTOARGB1555ROW_NEON | |
1331 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, | 1250 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, |
1332 int width) { | 1251 int width) { |
1333 asm volatile ( | 1252 asm volatile ( |
1334 "1: \n" | 1253 "1: \n" |
1335 MEMACCESS(0) | 1254 MEMACCESS(0) |
1336 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1255 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1337 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1256 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1338 ARGBTOARGB1555 | 1257 ARGBTOARGB1555 |
1339 MEMACCESS(1) | 1258 MEMACCESS(1) |
1340 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. | 1259 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. |
1341 "b.gt 1b \n" | 1260 "b.gt 1b \n" |
1342 : "+r"(src_argb), // %0 | 1261 : "+r"(src_argb), // %0 |
1343 "+r"(dst_argb1555), // %1 | 1262 "+r"(dst_argb1555), // %1 |
1344 "+r"(width) // %2 | 1263 "+r"(width) // %2 |
1345 : | 1264 : |
1346 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1265 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
1347 ); | 1266 ); |
1348 } | 1267 } |
1349 #endif // HAS_ARGBTOARGB1555ROW_NEON | |
1350 | 1268 |
1351 #ifdef HAS_ARGBTOARGB4444ROW_NEON | |
1352 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, | 1269 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, |
1353 int width) { | 1270 int width) { |
1354 asm volatile ( | 1271 asm volatile ( |
1355 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 1272 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
1356 "1: \n" | 1273 "1: \n" |
1357 MEMACCESS(0) | 1274 MEMACCESS(0) |
1358 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1275 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1359 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1276 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1360 ARGBTOARGB4444 | 1277 ARGBTOARGB4444 |
1361 MEMACCESS(1) | 1278 MEMACCESS(1) |
1362 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. | 1279 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. |
1363 "b.gt 1b \n" | 1280 "b.gt 1b \n" |
1364 : "+r"(src_argb), // %0 | 1281 : "+r"(src_argb), // %0 |
1365 "+r"(dst_argb4444), // %1 | 1282 "+r"(dst_argb4444), // %1 |
1366 "+r"(width) // %2 | 1283 "+r"(width) // %2 |
1367 : | 1284 : |
1368 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" | 1285 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" |
1369 ); | 1286 ); |
1370 } | 1287 } |
1371 #endif // HAS_ARGBTOARGB4444ROW_NEON | |
1372 | 1288 |
1373 #ifdef HAS_ARGBTOYROW_NEON | |
1374 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { | 1289 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
1375 asm volatile ( | 1290 asm volatile ( |
1376 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1291 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
1377 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1292 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
1378 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1293 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
1379 "movi v7.8b, #16 \n" // Add 16 constant | 1294 "movi v7.8b, #16 \n" // Add 16 constant |
1380 "1: \n" | 1295 "1: \n" |
1381 MEMACCESS(0) | 1296 MEMACCESS(0) |
1382 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1297 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1383 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1298 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1384 "umull v3.8h, v0.8b, v4.8b \n" // B | 1299 "umull v3.8h, v0.8b, v4.8b \n" // B |
1385 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1300 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1386 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1301 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1387 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1302 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
1388 "uqadd v0.8b, v0.8b, v7.8b \n" | 1303 "uqadd v0.8b, v0.8b, v7.8b \n" |
1389 MEMACCESS(1) | 1304 MEMACCESS(1) |
1390 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1305 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1391 "b.gt 1b \n" | 1306 "b.gt 1b \n" |
1392 : "+r"(src_argb), // %0 | 1307 : "+r"(src_argb), // %0 |
1393 "+r"(dst_y), // %1 | 1308 "+r"(dst_y), // %1 |
1394 "+r"(width) // %2 | 1309 "+r"(width) // %2 |
1395 : | 1310 : |
1396 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 1311 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
1397 ); | 1312 ); |
1398 } | 1313 } |
1399 #endif // HAS_ARGBTOYROW_NEON | |
1400 | 1314 |
1401 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON | |
1402 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { | 1315 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { |
1403 asm volatile ( | 1316 asm volatile ( |
1404 "1: \n" | 1317 "1: \n" |
1405 MEMACCESS(0) | 1318 MEMACCESS(0) |
1406 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pix
els | 1319 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pix
els |
1407 "subs %w2, %w2, #16 \n" // 16 processed per loop | 1320 "subs %w2, %w2, #16 \n" // 16 processed per loop |
1408 MEMACCESS(1) | 1321 MEMACCESS(1) |
1409 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. | 1322 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. |
1410 "b.gt 1b \n" | 1323 "b.gt 1b \n" |
1411 : "+r"(src_argb), // %0 | 1324 : "+r"(src_argb), // %0 |
1412 "+r"(dst_a), // %1 | 1325 "+r"(dst_a), // %1 |
1413 "+r"(width) // %2 | 1326 "+r"(width) // %2 |
1414 : | 1327 : |
1415 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1328 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1416 ); | 1329 ); |
1417 } | 1330 } |
1418 #endif // HAS_ARGBEXTRACTALPHAROW_NEON | |
1419 | 1331 |
1420 #ifdef HAS_ARGBTOYJROW_NEON | |
1421 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { | 1332 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
1422 asm volatile ( | 1333 asm volatile ( |
1423 "movi v4.8b, #15 \n" // B * 0.11400 coefficient | 1334 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
1424 "movi v5.8b, #75 \n" // G * 0.58700 coefficient | 1335 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
1425 "movi v6.8b, #38 \n" // R * 0.29900 coefficient | 1336 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
1426 "1: \n" | 1337 "1: \n" |
1427 MEMACCESS(0) | 1338 MEMACCESS(0) |
1428 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1339 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1429 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1340 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1430 "umull v3.8h, v0.8b, v4.8b \n" // B | 1341 "umull v3.8h, v0.8b, v4.8b \n" // B |
1431 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1342 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1432 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1343 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1433 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y | 1344 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
1434 MEMACCESS(1) | 1345 MEMACCESS(1) |
1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1346 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1436 "b.gt 1b \n" | 1347 "b.gt 1b \n" |
1437 : "+r"(src_argb), // %0 | 1348 : "+r"(src_argb), // %0 |
1438 "+r"(dst_y), // %1 | 1349 "+r"(dst_y), // %1 |
1439 "+r"(width) // %2 | 1350 "+r"(width) // %2 |
1440 : | 1351 : |
1441 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 1352 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
1442 ); | 1353 ); |
1443 } | 1354 } |
1444 #endif // HAS_ARGBTOYJROW_NEON | |
1445 | 1355 |
1446 // 8x1 pixels. | 1356 // 8x1 pixels. |
1447 #ifdef HAS_ARGBTOUV444ROW_NEON | |
1448 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1357 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
1449 int width) { | 1358 int width) { |
1450 asm volatile ( | 1359 asm volatile ( |
1451 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient | 1360 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient |
1452 "movi v25.8b, #74 \n" // UG -0.5781 coefficient | 1361 "movi v25.8b, #74 \n" // UG -0.5781 coefficient |
1453 "movi v26.8b, #38 \n" // UR -0.2969 coefficient | 1362 "movi v26.8b, #38 \n" // UR -0.2969 coefficient |
1454 "movi v27.8b, #18 \n" // VB -0.1406 coefficient | 1363 "movi v27.8b, #18 \n" // VB -0.1406 coefficient |
1455 "movi v28.8b, #94 \n" // VG -0.7344 coefficient | 1364 "movi v28.8b, #94 \n" // VG -0.7344 coefficient |
1456 "movi v29.16b,#0x80 \n" // 128.5 | 1365 "movi v29.16b,#0x80 \n" // 128.5 |
1457 "1: \n" | 1366 "1: \n" |
(...skipping 20 matching lines...) Expand all Loading... |
1478 "b.gt 1b \n" | 1387 "b.gt 1b \n" |
1479 : "+r"(src_argb), // %0 | 1388 : "+r"(src_argb), // %0 |
1480 "+r"(dst_u), // %1 | 1389 "+r"(dst_u), // %1 |
1481 "+r"(dst_v), // %2 | 1390 "+r"(dst_v), // %2 |
1482 "+r"(width) // %3 | 1391 "+r"(width) // %3 |
1483 : | 1392 : |
1484 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1393 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1485 "v24", "v25", "v26", "v27", "v28", "v29" | 1394 "v24", "v25", "v26", "v27", "v28", "v29" |
1486 ); | 1395 ); |
1487 } | 1396 } |
1488 #endif // HAS_ARGBTOUV444ROW_NEON | |
1489 | 1397 |
/* Asm fragment that loads the chroma constants used by RGBTOUV into
 * v20..v25. Each weight is stored as a 16-bit lane holding half of the
 * 8-bit-scaled coefficient; v25 holds 0x80 in every byte, i.e. 0x8080 per
 * 16-bit lane. Any asm statement that expands this macro must list
 * v20..v25 as clobbered. */
#define RGBTOUV_SETUP_REG                                                    \
    "movi       v20.8h, #56, lsl #0  \n" /* UB and VR weight:  0.875  / 2 */ \
    "movi       v21.8h, #37, lsl #0  \n" /* UG weight:        -0.5781 / 2 */ \
    "movi       v22.8h, #19, lsl #0  \n" /* UR weight:        -0.2969 / 2 */ \
    "movi       v23.8h, #9,  lsl #0  \n" /* VB weight:        -0.1406 / 2 */ \
    "movi       v24.8h, #47, lsl #0  \n" /* VG weight:        -0.7344 / 2 */ \
    "movi       v25.16b, #0x80       \n" /* bias 128.5: 0x8080 per lane   */
1497 | 1405 |
1498 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. | 1406 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. |
1499 #ifdef HAS_ARGBTOUV411ROW_NEON | |
1500 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1407 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
1501 int width) { | 1408 int width) { |
1502 asm volatile ( | 1409 asm volatile ( |
1503 RGBTOUV_SETUP_REG | 1410 RGBTOUV_SETUP_REG |
1504 "1: \n" | 1411 "1: \n" |
1505 MEMACCESS(0) | 1412 MEMACCESS(0) |
1506 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1413 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1507 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1414 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1508 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1415 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1509 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1416 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
(...skipping 29 matching lines...) Expand all Loading... |
1539 "b.gt 1b \n" | 1446 "b.gt 1b \n" |
1540 : "+r"(src_argb), // %0 | 1447 : "+r"(src_argb), // %0 |
1541 "+r"(dst_u), // %1 | 1448 "+r"(dst_u), // %1 |
1542 "+r"(dst_v), // %2 | 1449 "+r"(dst_v), // %2 |
1543 "+r"(width) // %3 | 1450 "+r"(width) // %3 |
1544 : | 1451 : |
1545 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1452 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1546 "v20", "v21", "v22", "v23", "v24", "v25" | 1453 "v20", "v21", "v22", "v23", "v24", "v25" |
1547 ); | 1454 ); |
1548 } | 1455 } |
1549 #endif // HAS_ARGBTOUV411ROW_NEON | |
1550 | 1456 |
1551 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1457 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
/* Asm fragment that turns 16-bit B, G and R lanes into 8 U and 8 V bytes.
 *   QB, QG, QR  names of the vector registers (with .8h arrangement) that
 *               hold the blue, green and red inputs.
 * Expects the weights in v20..v24 and the bias in v25 (the layout that
 * RGBTOUV_SETUP_REG produces). Uses v3 and v4 as accumulators and leaves
 * U in v0.8b and V in v1.8b:
 *   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8
 *   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8
 * The U and V computations are interleaved instruction by instruction. */
#define RGBTOUV(QB, QG, QR)                                                  \
    "mul        v3.8h, " #QB ",v20.8h  \n" /* U  = B * UB weight          */ \
    "mul        v4.8h, " #QR ",v20.8h  \n" /* V  = R * VR weight          */ \
    "mls        v3.8h, " #QG ",v21.8h  \n" /* U -= G * UG weight          */ \
    "mls        v4.8h, " #QG ",v24.8h  \n" /* V -= G * VG weight          */ \
    "mls        v3.8h, " #QR ",v22.8h  \n" /* U -= R * UR weight          */ \
    "mls        v4.8h, " #QB ",v23.8h  \n" /* V -= B * VB weight          */ \
    "add        v3.8h, v3.8h, v25.8h   \n" /* U += bias (to unsigned)     */ \
    "add        v4.8h, v4.8h, v25.8h   \n" /* V += bias (to unsigned)     */ \
    "uqshrn     v0.8b, v3.8h, #8       \n" /* narrow U: 16 bit -> 8 bit   */ \
    "uqshrn     v1.8b, v4.8h, #8       \n" /* narrow V: 16 bit -> 8 bit   */
1563 | 1469 |
1564 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. | 1470 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. |
1565 // TODO(fbarchard): consider ptrdiff_t for all strides. | 1471 // TODO(fbarchard): consider ptrdiff_t for all strides. |
1566 | 1472 |
1567 #ifdef HAS_ARGBTOUVROW_NEON | |
1568 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, | 1473 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, |
1569 uint8* dst_u, uint8* dst_v, int width) { | 1474 uint8* dst_u, uint8* dst_v, int width) { |
1570 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1475 const uint8* src_argb_1 = src_argb + src_stride_argb; |
1571 asm volatile ( | 1476 asm volatile ( |
1572 RGBTOUV_SETUP_REG | 1477 RGBTOUV_SETUP_REG |
1573 "1: \n" | 1478 "1: \n" |
1574 MEMACCESS(0) | 1479 MEMACCESS(0) |
1575 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1480 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1576 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1481 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1577 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1482 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
(...skipping 19 matching lines...) Expand all Loading... |
1597 : "+r"(src_argb), // %0 | 1502 : "+r"(src_argb), // %0 |
1598 "+r"(src_argb_1), // %1 | 1503 "+r"(src_argb_1), // %1 |
1599 "+r"(dst_u), // %2 | 1504 "+r"(dst_u), // %2 |
1600 "+r"(dst_v), // %3 | 1505 "+r"(dst_v), // %3 |
1601 "+r"(width) // %4 | 1506 "+r"(width) // %4 |
1602 : | 1507 : |
1603 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1508 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1604 "v20", "v21", "v22", "v23", "v24", "v25" | 1509 "v20", "v21", "v22", "v23", "v24", "v25" |
1605 ); | 1510 ); |
1606 } | 1511 } |
1607 #endif // HAS_ARGBTOUVROW_NEON | |
1608 | 1512 |
1609 // TODO(fbarchard): Subsample match C code. | 1513 // TODO(fbarchard): Subsample match C code. |
1610 #ifdef HAS_ARGBTOUVJROW_NEON | |
1611 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, | 1514 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, |
1612 uint8* dst_u, uint8* dst_v, int width) { | 1515 uint8* dst_u, uint8* dst_v, int width) { |
1613 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1516 const uint8* src_argb_1 = src_argb + src_stride_argb; |
1614 asm volatile ( | 1517 asm volatile ( |
1615 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 | 1518 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 |
1616 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 | 1519 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 |
1617 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 | 1520 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 |
1618 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 | 1521 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 |
1619 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 | 1522 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 |
1620 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1523 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
(...skipping 23 matching lines...) Expand all Loading... |
1644 : "+r"(src_argb), // %0 | 1547 : "+r"(src_argb), // %0 |
1645 "+r"(src_argb_1), // %1 | 1548 "+r"(src_argb_1), // %1 |
1646 "+r"(dst_u), // %2 | 1549 "+r"(dst_u), // %2 |
1647 "+r"(dst_v), // %3 | 1550 "+r"(dst_v), // %3 |
1648 "+r"(width) // %4 | 1551 "+r"(width) // %4 |
1649 : | 1552 : |
1650 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1553 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1651 "v20", "v21", "v22", "v23", "v24", "v25" | 1554 "v20", "v21", "v22", "v23", "v24", "v25" |
1652 ); | 1555 ); |
1653 } | 1556 } |
1654 #endif // HAS_ARGBTOUVJROW_NEON | |
1655 | 1557 |
1656 #ifdef HAS_BGRATOUVROW_NEON | |
1657 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, | 1558 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, |
1658 uint8* dst_u, uint8* dst_v, int width) { | 1559 uint8* dst_u, uint8* dst_v, int width) { |
1659 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; | 1560 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; |
1660 asm volatile ( | 1561 asm volatile ( |
1661 RGBTOUV_SETUP_REG | 1562 RGBTOUV_SETUP_REG |
1662 "1: \n" | 1563 "1: \n" |
1663 MEMACCESS(0) | 1564 MEMACCESS(0) |
1664 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1565 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1665 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. | 1566 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. |
1666 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1567 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
(...skipping 18 matching lines...) Expand all Loading... |
1685 : "+r"(src_bgra), // %0 | 1586 : "+r"(src_bgra), // %0 |
1686 "+r"(src_bgra_1), // %1 | 1587 "+r"(src_bgra_1), // %1 |
1687 "+r"(dst_u), // %2 | 1588 "+r"(dst_u), // %2 |
1688 "+r"(dst_v), // %3 | 1589 "+r"(dst_v), // %3 |
1689 "+r"(width) // %4 | 1590 "+r"(width) // %4 |
1690 : | 1591 : |
1691 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1592 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1692 "v20", "v21", "v22", "v23", "v24", "v25" | 1593 "v20", "v21", "v22", "v23", "v24", "v25" |
1693 ); | 1594 ); |
1694 } | 1595 } |
1695 #endif // HAS_BGRATOUVROW_NEON | |
1696 | 1596 |
1697 #ifdef HAS_ABGRTOUVROW_NEON | |
1698 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, | 1597 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, |
1699 uint8* dst_u, uint8* dst_v, int width) { | 1598 uint8* dst_u, uint8* dst_v, int width) { |
1700 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; | 1599 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; |
1701 asm volatile ( | 1600 asm volatile ( |
1702 RGBTOUV_SETUP_REG | 1601 RGBTOUV_SETUP_REG |
1703 "1: \n" | 1602 "1: \n" |
1704 MEMACCESS(0) | 1603 MEMACCESS(0) |
1705 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1604 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1706 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1605 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
1707 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1606 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
(...skipping 18 matching lines...) Expand all Loading... |
1726 : "+r"(src_abgr), // %0 | 1625 : "+r"(src_abgr), // %0 |
1727 "+r"(src_abgr_1), // %1 | 1626 "+r"(src_abgr_1), // %1 |
1728 "+r"(dst_u), // %2 | 1627 "+r"(dst_u), // %2 |
1729 "+r"(dst_v), // %3 | 1628 "+r"(dst_v), // %3 |
1730 "+r"(width) // %4 | 1629 "+r"(width) // %4 |
1731 : | 1630 : |
1732 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1631 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1733 "v20", "v21", "v22", "v23", "v24", "v25" | 1632 "v20", "v21", "v22", "v23", "v24", "v25" |
1734 ); | 1633 ); |
1735 } | 1634 } |
1736 #endif // HAS_ABGRTOUVROW_NEON | |
1737 | 1635 |
1738 #ifdef HAS_RGBATOUVROW_NEON | |
1739 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, | 1636 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, |
1740 uint8* dst_u, uint8* dst_v, int width) { | 1637 uint8* dst_u, uint8* dst_v, int width) { |
1741 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; | 1638 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; |
1742 asm volatile ( | 1639 asm volatile ( |
1743 RGBTOUV_SETUP_REG | 1640 RGBTOUV_SETUP_REG |
1744 "1: \n" | 1641 "1: \n" |
1745 MEMACCESS(0) | 1642 MEMACCESS(0) |
1746 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1643 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1747 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. | 1644 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. |
1748 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1645 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
(...skipping 18 matching lines...) Expand all Loading... |
1767 : "+r"(src_rgba), // %0 | 1664 : "+r"(src_rgba), // %0 |
1768 "+r"(src_rgba_1), // %1 | 1665 "+r"(src_rgba_1), // %1 |
1769 "+r"(dst_u), // %2 | 1666 "+r"(dst_u), // %2 |
1770 "+r"(dst_v), // %3 | 1667 "+r"(dst_v), // %3 |
1771 "+r"(width) // %4 | 1668 "+r"(width) // %4 |
1772 : | 1669 : |
1773 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1670 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1774 "v20", "v21", "v22", "v23", "v24", "v25" | 1671 "v20", "v21", "v22", "v23", "v24", "v25" |
1775 ); | 1672 ); |
1776 } | 1673 } |
1777 #endif // HAS_RGBATOUVROW_NEON | |
1778 | 1674 |
1779 #ifdef HAS_RGB24TOUVROW_NEON | |
1780 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, | 1675 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, |
1781 uint8* dst_u, uint8* dst_v, int width) { | 1676 uint8* dst_u, uint8* dst_v, int width) { |
1782 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; | 1677 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; |
1783 asm volatile ( | 1678 asm volatile ( |
1784 RGBTOUV_SETUP_REG | 1679 RGBTOUV_SETUP_REG |
1785 "1: \n" | 1680 "1: \n" |
1786 MEMACCESS(0) | 1681 MEMACCESS(0) |
1787 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. | 1682 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. |
1788 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1683 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1789 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1684 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
(...skipping 18 matching lines...) Expand all Loading... |
1808 : "+r"(src_rgb24), // %0 | 1703 : "+r"(src_rgb24), // %0 |
1809 "+r"(src_rgb24_1), // %1 | 1704 "+r"(src_rgb24_1), // %1 |
1810 "+r"(dst_u), // %2 | 1705 "+r"(dst_u), // %2 |
1811 "+r"(dst_v), // %3 | 1706 "+r"(dst_v), // %3 |
1812 "+r"(width) // %4 | 1707 "+r"(width) // %4 |
1813 : | 1708 : |
1814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1709 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1815 "v20", "v21", "v22", "v23", "v24", "v25" | 1710 "v20", "v21", "v22", "v23", "v24", "v25" |
1816 ); | 1711 ); |
1817 } | 1712 } |
1818 #endif // HAS_RGB24TOUVROW_NEON | |
1819 | 1713 |
1820 #ifdef HAS_RAWTOUVROW_NEON | |
1821 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, | 1714 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, |
1822 uint8* dst_u, uint8* dst_v, int width) { | 1715 uint8* dst_u, uint8* dst_v, int width) { |
1823 const uint8* src_raw_1 = src_raw + src_stride_raw; | 1716 const uint8* src_raw_1 = src_raw + src_stride_raw; |
1824 asm volatile ( | 1717 asm volatile ( |
1825 RGBTOUV_SETUP_REG | 1718 RGBTOUV_SETUP_REG |
1826 "1: \n" | 1719 "1: \n" |
1827 MEMACCESS(0) | 1720 MEMACCESS(0) |
1828 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. | 1721 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. |
1829 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1722 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
1830 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1723 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
(...skipping 18 matching lines...) Expand all Loading... |
1849 : "+r"(src_raw), // %0 | 1742 : "+r"(src_raw), // %0 |
1850 "+r"(src_raw_1), // %1 | 1743 "+r"(src_raw_1), // %1 |
1851 "+r"(dst_u), // %2 | 1744 "+r"(dst_u), // %2 |
1852 "+r"(dst_v), // %3 | 1745 "+r"(dst_v), // %3 |
1853 "+r"(width) // %4 | 1746 "+r"(width) // %4 |
1854 : | 1747 : |
1855 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1748 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1856 "v20", "v21", "v22", "v23", "v24", "v25" | 1749 "v20", "v21", "v22", "v23", "v24", "v25" |
1857 ); | 1750 ); |
1858 } | 1751 } |
1859 #endif // HAS_RAWTOUVROW_NEON | |
1860 | 1752 |
1861 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1753 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
1862 #ifdef HAS_RGB565TOUVROW_NEON | |
1863 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, | 1754 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, |
1864 uint8* dst_u, uint8* dst_v, int width) { | 1755 uint8* dst_u, uint8* dst_v, int width) { |
1865 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; | 1756 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; |
1866 asm volatile ( | 1757 asm volatile ( |
1867 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 | 1758 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 |
1868 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 | 1759 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 |
1869 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 | 1760 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 |
1870 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 | 1761 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 |
1871 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 | 1762 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 |
1872 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1763 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1925 "+r"(src_rgb565_1), // %1 | 1816 "+r"(src_rgb565_1), // %1 |
1926 "+r"(dst_u), // %2 | 1817 "+r"(dst_u), // %2 |
1927 "+r"(dst_v), // %3 | 1818 "+r"(dst_v), // %3 |
1928 "+r"(width) // %4 | 1819 "+r"(width) // %4 |
1929 : | 1820 : |
1930 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1821 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
1931 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | 1822 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", |
1932 "v25", "v26", "v27" | 1823 "v25", "v26", "v27" |
1933 ); | 1824 ); |
1934 } | 1825 } |
1935 #endif // HAS_RGB565TOUVROW_NEON | |
1936 | 1826 |
1937 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1827 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
1938 #ifdef HAS_ARGB1555TOUVROW_NEON | |
1939 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, | 1828 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, |
1940 uint8* dst_u, uint8* dst_v, int width) { | 1829 uint8* dst_u, uint8* dst_v, int width) { |
1941 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; | 1830 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; |
1942 asm volatile ( | 1831 asm volatile ( |
1943 RGBTOUV_SETUP_REG | 1832 RGBTOUV_SETUP_REG |
1944 "1: \n" | 1833 "1: \n" |
1945 MEMACCESS(0) | 1834 MEMACCESS(0) |
1946 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 1835 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
1947 RGB555TOARGB | 1836 RGB555TOARGB |
1948 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 1837 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1996 "+r"(src_argb1555_1), // %1 | 1885 "+r"(src_argb1555_1), // %1 |
1997 "+r"(dst_u), // %2 | 1886 "+r"(dst_u), // %2 |
1998 "+r"(dst_v), // %3 | 1887 "+r"(dst_v), // %3 |
1999 "+r"(width) // %4 | 1888 "+r"(width) // %4 |
2000 : | 1889 : |
2001 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 1890 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
2002 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 1891 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
2003 "v26", "v27", "v28" | 1892 "v26", "v27", "v28" |
2004 ); | 1893 ); |
2005 } | 1894 } |
2006 #endif // HAS_ARGB1555TOUVROW_NEON | |
2007 | 1895 |
2008 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1896 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
2009 #ifdef HAS_ARGB4444TOUVROW_NEON | |
2010 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, | 1897 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, |
2011 uint8* dst_u, uint8* dst_v, int width) { | 1898 uint8* dst_u, uint8* dst_v, int width) { |
2012 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; | 1899 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; |
2013 asm volatile ( | 1900 asm volatile ( |
2014 RGBTOUV_SETUP_REG | 1901 RGBTOUV_SETUP_REG |
2015 "1: \n" | 1902 "1: \n" |
2016 MEMACCESS(0) | 1903 MEMACCESS(0) |
2017 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 1904 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
2018 ARGB4444TOARGB | 1905 ARGB4444TOARGB |
2019 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 1906 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2068 "+r"(dst_u), // %2 | 1955 "+r"(dst_u), // %2 |
2069 "+r"(dst_v), // %3 | 1956 "+r"(dst_v), // %3 |
2070 "+r"(width) // %4 | 1957 "+r"(width) // %4 |
2071 : | 1958 : |
2072 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 1959 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
2073 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 1960 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
2074 "v26", "v27", "v28" | 1961 "v26", "v27", "v28" |
2075 | 1962 |
2076 ); | 1963 ); |
2077 } | 1964 } |
2078 #endif // HAS_ARGB4444TOUVROW_NEON | |
2079 | 1965 |
2080 #ifdef HAS_RGB565TOYROW_NEON | |
2081 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { | 1966 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { |
2082 asm volatile ( | 1967 asm volatile ( |
2083 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 1968 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
2084 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 1969 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
2085 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 1970 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
2086 "movi v27.8b, #16 \n" // Add 16 constant | 1971 "movi v27.8b, #16 \n" // Add 16 constant |
2087 "1: \n" | 1972 "1: \n" |
2088 MEMACCESS(0) | 1973 MEMACCESS(0) |
2089 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 1974 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
2090 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1975 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2091 RGB565TOARGB | 1976 RGB565TOARGB |
2092 "umull v3.8h, v0.8b, v24.8b \n" // B | 1977 "umull v3.8h, v0.8b, v24.8b \n" // B |
2093 "umlal v3.8h, v1.8b, v25.8b \n" // G | 1978 "umlal v3.8h, v1.8b, v25.8b \n" // G |
2094 "umlal v3.8h, v2.8b, v26.8b \n" // R | 1979 "umlal v3.8h, v2.8b, v26.8b \n" // R |
2095 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1980 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2096 "uqadd v0.8b, v0.8b, v27.8b \n" | 1981 "uqadd v0.8b, v0.8b, v27.8b \n" |
2097 MEMACCESS(1) | 1982 MEMACCESS(1) |
2098 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1983 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2099 "b.gt 1b \n" | 1984 "b.gt 1b \n" |
2100 : "+r"(src_rgb565), // %0 | 1985 : "+r"(src_rgb565), // %0 |
2101 "+r"(dst_y), // %1 | 1986 "+r"(dst_y), // %1 |
2102 "+r"(width) // %2 | 1987 "+r"(width) // %2 |
2103 : | 1988 : |
2104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", | 1989 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", |
2105 "v24", "v25", "v26", "v27" | 1990 "v24", "v25", "v26", "v27" |
2106 ); | 1991 ); |
2107 } | 1992 } |
2108 #endif // HAS_RGB565TOYROW_NEON | |
2109 | 1993 |
2110 #ifdef HAS_ARGB1555TOYROW_NEON | |
2111 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { | 1994 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { |
2112 asm volatile ( | 1995 asm volatile ( |
2113 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1996 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2114 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1997 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2115 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1998 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2116 "movi v7.8b, #16 \n" // Add 16 constant | 1999 "movi v7.8b, #16 \n" // Add 16 constant |
2117 "1: \n" | 2000 "1: \n" |
2118 MEMACCESS(0) | 2001 MEMACCESS(0) |
2119 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2002 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
2120 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2003 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2121 ARGB1555TOARGB | 2004 ARGB1555TOARGB |
2122 "umull v3.8h, v0.8b, v4.8b \n" // B | 2005 "umull v3.8h, v0.8b, v4.8b \n" // B |
2123 "umlal v3.8h, v1.8b, v5.8b \n" // G | 2006 "umlal v3.8h, v1.8b, v5.8b \n" // G |
2124 "umlal v3.8h, v2.8b, v6.8b \n" // R | 2007 "umlal v3.8h, v2.8b, v6.8b \n" // R |
2125 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2008 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2126 "uqadd v0.8b, v0.8b, v7.8b \n" | 2009 "uqadd v0.8b, v0.8b, v7.8b \n" |
2127 MEMACCESS(1) | 2010 MEMACCESS(1) |
2128 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2011 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2129 "b.gt 1b \n" | 2012 "b.gt 1b \n" |
2130 : "+r"(src_argb1555), // %0 | 2013 : "+r"(src_argb1555), // %0 |
2131 "+r"(dst_y), // %1 | 2014 "+r"(dst_y), // %1 |
2132 "+r"(width) // %2 | 2015 "+r"(width) // %2 |
2133 : | 2016 : |
2134 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2017 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2135 ); | 2018 ); |
2136 } | 2019 } |
2137 #endif // HAS_ARGB1555TOYROW_NEON | |
2138 | 2020 |
2139 #ifdef HAS_ARGB4444TOYROW_NEON | |
2140 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { | 2021 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { |
2141 asm volatile ( | 2022 asm volatile ( |
2142 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2023 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
2143 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2024 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
2144 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2025 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
2145 "movi v27.8b, #16 \n" // Add 16 constant | 2026 "movi v27.8b, #16 \n" // Add 16 constant |
2146 "1: \n" | 2027 "1: \n" |
2147 MEMACCESS(0) | 2028 MEMACCESS(0) |
2148 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2029 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
2149 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2030 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2150 ARGB4444TOARGB | 2031 ARGB4444TOARGB |
2151 "umull v3.8h, v0.8b, v24.8b \n" // B | 2032 "umull v3.8h, v0.8b, v24.8b \n" // B |
2152 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2033 "umlal v3.8h, v1.8b, v25.8b \n" // G |
2153 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2034 "umlal v3.8h, v2.8b, v26.8b \n" // R |
2154 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2035 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2155 "uqadd v0.8b, v0.8b, v27.8b \n" | 2036 "uqadd v0.8b, v0.8b, v27.8b \n" |
2156 MEMACCESS(1) | 2037 MEMACCESS(1) |
2157 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2038 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2158 "b.gt 1b \n" | 2039 "b.gt 1b \n" |
2159 : "+r"(src_argb4444), // %0 | 2040 : "+r"(src_argb4444), // %0 |
2160 "+r"(dst_y), // %1 | 2041 "+r"(dst_y), // %1 |
2161 "+r"(width) // %2 | 2042 "+r"(width) // %2 |
2162 : | 2043 : |
2163 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" | 2044 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" |
2164 ); | 2045 ); |
2165 } | 2046 } |
2166 #endif // HAS_ARGB4444TOYROW_NEON | |
2167 | 2047 |
2168 #ifdef HAS_BGRATOYROW_NEON | |
2169 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { | 2048 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { |
2170 asm volatile ( | 2049 asm volatile ( |
2171 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2050 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2172 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2051 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2173 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2052 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2174 "movi v7.8b, #16 \n" // Add 16 constant | 2053 "movi v7.8b, #16 \n" // Add 16 constant |
2175 "1: \n" | 2054 "1: \n" |
2176 MEMACCESS(0) | 2055 MEMACCESS(0) |
2177 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2056 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2178 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2057 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2179 "umull v16.8h, v1.8b, v4.8b \n" // R | 2058 "umull v16.8h, v1.8b, v4.8b \n" // R |
2180 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2059 "umlal v16.8h, v2.8b, v5.8b \n" // G |
2181 "umlal v16.8h, v3.8b, v6.8b \n" // B | 2060 "umlal v16.8h, v3.8b, v6.8b \n" // B |
2182 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2061 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2183 "uqadd v0.8b, v0.8b, v7.8b \n" | 2062 "uqadd v0.8b, v0.8b, v7.8b \n" |
2184 MEMACCESS(1) | 2063 MEMACCESS(1) |
2185 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2064 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2186 "b.gt 1b \n" | 2065 "b.gt 1b \n" |
2187 : "+r"(src_bgra), // %0 | 2066 : "+r"(src_bgra), // %0 |
2188 "+r"(dst_y), // %1 | 2067 "+r"(dst_y), // %1 |
2189 "+r"(width) // %2 | 2068 "+r"(width) // %2 |
2190 : | 2069 : |
2191 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2070 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2192 ); | 2071 ); |
2193 } | 2072 } |
2194 #endif // HAS_BGRATOYROW_NEON | |
2195 | 2073 |
2196 #ifdef HAS_ABGRTOYROW_NEON | |
2197 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { | 2074 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { |
2198 asm volatile ( | 2075 asm volatile ( |
2199 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2076 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2200 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2077 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2201 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2078 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2202 "movi v7.8b, #16 \n" // Add 16 constant | 2079 "movi v7.8b, #16 \n" // Add 16 constant |
2203 "1: \n" | 2080 "1: \n" |
2204 MEMACCESS(0) | 2081 MEMACCESS(0) |
2205 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2082 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2206 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2083 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2207 "umull v16.8h, v0.8b, v4.8b \n" // R | 2084 "umull v16.8h, v0.8b, v4.8b \n" // R |
2208 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2085 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2209 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2086 "umlal v16.8h, v2.8b, v6.8b \n" // B |
2210 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2087 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2211 "uqadd v0.8b, v0.8b, v7.8b \n" | 2088 "uqadd v0.8b, v0.8b, v7.8b \n" |
2212 MEMACCESS(1) | 2089 MEMACCESS(1) |
2213 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2090 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2214 "b.gt 1b \n" | 2091 "b.gt 1b \n" |
2215 : "+r"(src_abgr), // %0 | 2092 : "+r"(src_abgr), // %0 |
2216 "+r"(dst_y), // %1 | 2093 "+r"(dst_y), // %1 |
2217 "+r"(width) // %2 | 2094 "+r"(width) // %2 |
2218 : | 2095 : |
2219 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2096 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2220 ); | 2097 ); |
2221 } | 2098 } |
2222 #endif // HAS_ABGRTOYROW_NEON | |
2223 | 2099 |
2224 #ifdef HAS_RGBATOYROW_NEON | |
2225 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { | 2100 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { |
2226 asm volatile ( | 2101 asm volatile ( |
2227 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2102 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2228 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2103 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2229 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2104 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2230 "movi v7.8b, #16 \n" // Add 16 constant | 2105 "movi v7.8b, #16 \n" // Add 16 constant |
2231 "1: \n" | 2106 "1: \n" |
2232 MEMACCESS(0) | 2107 MEMACCESS(0) |
2233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2108 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2234 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2109 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2235 "umull v16.8h, v1.8b, v4.8b \n" // B | 2110 "umull v16.8h, v1.8b, v4.8b \n" // B |
2236 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2111 "umlal v16.8h, v2.8b, v5.8b \n" // G |
2237 "umlal v16.8h, v3.8b, v6.8b \n" // R | 2112 "umlal v16.8h, v3.8b, v6.8b \n" // R |
2238 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2113 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2239 "uqadd v0.8b, v0.8b, v7.8b \n" | 2114 "uqadd v0.8b, v0.8b, v7.8b \n" |
2240 MEMACCESS(1) | 2115 MEMACCESS(1) |
2241 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2116 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2242 "b.gt 1b \n" | 2117 "b.gt 1b \n" |
2243 : "+r"(src_rgba), // %0 | 2118 : "+r"(src_rgba), // %0 |
2244 "+r"(dst_y), // %1 | 2119 "+r"(dst_y), // %1 |
2245 "+r"(width) // %2 | 2120 "+r"(width) // %2 |
2246 : | 2121 : |
2247 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2122 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2248 ); | 2123 ); |
2249 } | 2124 } |
2250 #endif // HAS_RGBATOYROW_NEON | |
2251 | 2125 |
2252 #ifdef HAS_RGB24TOYROW_NEON | |
2253 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { | 2126 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { |
2254 asm volatile ( | 2127 asm volatile ( |
2255 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2128 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2256 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2129 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2257 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2130 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2258 "movi v7.8b, #16 \n" // Add 16 constant | 2131 "movi v7.8b, #16 \n" // Add 16 constant |
2259 "1: \n" | 2132 "1: \n" |
2260 MEMACCESS(0) | 2133 MEMACCESS(0) |
2261 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2134 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
2262 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2135 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2263 "umull v16.8h, v0.8b, v4.8b \n" // B | 2136 "umull v16.8h, v0.8b, v4.8b \n" // B |
2264 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2137 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2265 "umlal v16.8h, v2.8b, v6.8b \n" // R | 2138 "umlal v16.8h, v2.8b, v6.8b \n" // R |
2266 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2139 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2267 "uqadd v0.8b, v0.8b, v7.8b \n" | 2140 "uqadd v0.8b, v0.8b, v7.8b \n" |
2268 MEMACCESS(1) | 2141 MEMACCESS(1) |
2269 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2142 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2270 "b.gt 1b \n" | 2143 "b.gt 1b \n" |
2271 : "+r"(src_rgb24), // %0 | 2144 : "+r"(src_rgb24), // %0 |
2272 "+r"(dst_y), // %1 | 2145 "+r"(dst_y), // %1 |
2273 "+r"(width) // %2 | 2146 "+r"(width) // %2 |
2274 : | 2147 : |
2275 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2148 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2276 ); | 2149 ); |
2277 } | 2150 } |
2278 #endif // HAS_RGB24TOYROW_NEON | |
2279 | 2151 |
2280 #ifdef HAS_RAWTOYROW_NEON | |
2281 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { | 2152 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { |
2282 asm volatile ( | 2153 asm volatile ( |
2283 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2154 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2284 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2155 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2285 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2156 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2286 "movi v7.8b, #16 \n" // Add 16 constant | 2157 "movi v7.8b, #16 \n" // Add 16 constant |
2287 "1: \n" | 2158 "1: \n" |
2288 MEMACCESS(0) | 2159 MEMACCESS(0) |
2289 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2160 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
2290 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2161 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2291 "umull v16.8h, v0.8b, v4.8b \n" // R | 2162 "umull v16.8h, v0.8b, v4.8b \n" // R |
2292 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2163 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2293 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2164 "umlal v16.8h, v2.8b, v6.8b \n" // B |
2294 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2165 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2295 "uqadd v0.8b, v0.8b, v7.8b \n" | 2166 "uqadd v0.8b, v0.8b, v7.8b \n" |
2296 MEMACCESS(1) | 2167 MEMACCESS(1) |
2297 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2298 "b.gt 1b \n" | 2169 "b.gt 1b \n" |
2299 : "+r"(src_raw), // %0 | 2170 : "+r"(src_raw), // %0 |
2300 "+r"(dst_y), // %1 | 2171 "+r"(dst_y), // %1 |
2301 "+r"(width) // %2 | 2172 "+r"(width) // %2 |
2302 : | 2173 : |
2303 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2304 ); | 2175 ); |
2305 } | 2176 } |
2306 #endif // HAS_RAWTOYROW_NEON | |
2307 | 2177 |
2308 // Bilinear filter 16x2 -> 16x1 | 2178 // Bilinear filter 16x2 -> 16x1 |
2309 #ifdef HAS_INTERPOLATEROW_NEON | |
2310 void InterpolateRow_NEON(uint8* dst_ptr, | 2179 void InterpolateRow_NEON(uint8* dst_ptr, |
2311 const uint8* src_ptr, ptrdiff_t src_stride, | 2180 const uint8* src_ptr, ptrdiff_t src_stride, |
2312 int dst_width, int source_y_fraction) { | 2181 int dst_width, int source_y_fraction) { |
2313 int y1_fraction = source_y_fraction; | 2182 int y1_fraction = source_y_fraction; |
2314 int y0_fraction = 256 - y1_fraction; | 2183 int y0_fraction = 256 - y1_fraction; |
2315 const uint8* src_ptr1 = src_ptr + src_stride; | 2184 const uint8* src_ptr1 = src_ptr + src_stride; |
2316 asm volatile ( | 2185 asm volatile ( |
2317 "cmp %w4, #0 \n" | 2186 "cmp %w4, #0 \n" |
2318 "b.eq 100f \n" | 2187 "b.eq 100f \n" |
2319 "cmp %w4, #128 \n" | 2188 "cmp %w4, #128 \n" |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2365 : "+r"(dst_ptr), // %0 | 2234 : "+r"(dst_ptr), // %0 |
2366 "+r"(src_ptr), // %1 | 2235 "+r"(src_ptr), // %1 |
2367 "+r"(src_ptr1), // %2 | 2236 "+r"(src_ptr1), // %2 |
2368 "+r"(dst_width), // %3 | 2237 "+r"(dst_width), // %3 |
2369 "+r"(y1_fraction), // %4 | 2238 "+r"(y1_fraction), // %4 |
2370 "+r"(y0_fraction) // %5 | 2239 "+r"(y0_fraction) // %5 |
2371 : | 2240 : |
2372 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" | 2241 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" |
2373 ); | 2242 ); |
2374 } | 2243 } |
2375 #endif // HAS_INTERPOLATEROW_NEON | |
2376 | 2244 |
2377 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr | 2245 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr |
2378 #ifdef HAS_ARGBBLENDROW_NEON | |
2379 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2246 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2380 uint8* dst_argb, int width) { | 2247 uint8* dst_argb, int width) { |
2381 asm volatile ( | 2248 asm volatile ( |
2382 "subs %w3, %w3, #8 \n" | 2249 "subs %w3, %w3, #8 \n" |
2383 "b.lt 89f \n" | 2250 "b.lt 89f \n" |
2384 // Blend 8 pixels. | 2251 // Blend 8 pixels. |
2385 "8: \n" | 2252 "8: \n" |
2386 MEMACCESS(0) | 2253 MEMACCESS(0) |
2387 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels | 2254 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels |
2388 MEMACCESS(1) | 2255 MEMACCESS(1) |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2437 | 2304 |
2438 : "+r"(src_argb0), // %0 | 2305 : "+r"(src_argb0), // %0 |
2439 "+r"(src_argb1), // %1 | 2306 "+r"(src_argb1), // %1 |
2440 "+r"(dst_argb), // %2 | 2307 "+r"(dst_argb), // %2 |
2441 "+r"(width) // %3 | 2308 "+r"(width) // %3 |
2442 : | 2309 : |
2443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 2310 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
2444 "v16", "v17", "v18" | 2311 "v16", "v17", "v18" |
2445 ); | 2312 ); |
2446 } | 2313 } |
2447 #endif // HAS_ARGBBLENDROW_NEON | |
2448 | 2314 |
2449 // Attenuate 8 pixels at a time. | 2315 // Attenuate 8 pixels at a time. |
2450 #ifdef HAS_ARGBATTENUATEROW_NEON | |
2451 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 2316 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
2452 asm volatile ( | 2317 asm volatile ( |
2453 // Attenuate 8 pixels. | 2318 // Attenuate 8 pixels. |
2454 "1: \n" | 2319 "1: \n" |
2455 MEMACCESS(0) | 2320 MEMACCESS(0) |
2456 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels | 2321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels |
2457 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2322 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2458 "umull v4.8h, v0.8b, v3.8b \n" // b * a | 2323 "umull v4.8h, v0.8b, v3.8b \n" // b * a |
2459 "umull v5.8h, v1.8b, v3.8b \n" // g * a | 2324 "umull v5.8h, v1.8b, v3.8b \n" // g * a |
2460 "umull v6.8h, v2.8b, v3.8b \n" // r * a | 2325 "umull v6.8h, v2.8b, v3.8b \n" // r * a |
2461 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 | 2326 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 |
2462 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 | 2327 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 |
2463 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 | 2328 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 |
2464 MEMACCESS(1) | 2329 MEMACCESS(1) |
2465 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 2330 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
2466 "b.gt 1b \n" | 2331 "b.gt 1b \n" |
2467 : "+r"(src_argb), // %0 | 2332 : "+r"(src_argb), // %0 |
2468 "+r"(dst_argb), // %1 | 2333 "+r"(dst_argb), // %1 |
2469 "+r"(width) // %2 | 2334 "+r"(width) // %2 |
2470 : | 2335 : |
2471 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 2336 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
2472 ); | 2337 ); |
2473 } | 2338 } |
2474 #endif // HAS_ARGBATTENUATEROW_NEON | |
2475 | 2339 |
2476 // Quantize 8 ARGB pixels (32 bytes). | 2340 // Quantize 8 ARGB pixels (32 bytes). |
2477 // dst = (dst * scale >> 16) * interval_size + interval_offset; | 2341 // dst = (dst * scale >> 16) * interval_size + interval_offset; |
2478 #ifdef HAS_ARGBQUANTIZEROW_NEON | |
2479 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, | 2342 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, |
2480 int interval_offset, int width) { | 2343 int interval_offset, int width) { |
2481 asm volatile ( | 2344 asm volatile ( |
2482 "dup v4.8h, %w2 \n" | 2345 "dup v4.8h, %w2 \n" |
2483 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 | 2346 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 |
2484 "dup v5.8h, %w3 \n" // interval multiply. | 2347 "dup v5.8h, %w3 \n" // interval multiply. |
2485 "dup v6.8h, %w4 \n" // interval add | 2348 "dup v6.8h, %w4 \n" // interval add |
2486 | 2349 |
2487 // 8 pixel loop. | 2350 // 8 pixel loop. |
2488 "1: \n" | 2351 "1: \n" |
(...skipping 19 matching lines...) Expand all Loading... |
2508 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels | 2371 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels |
2509 "b.gt 1b \n" | 2372 "b.gt 1b \n" |
2510 : "+r"(dst_argb), // %0 | 2373 : "+r"(dst_argb), // %0 |
2511 "+r"(width) // %1 | 2374 "+r"(width) // %1 |
2512 : "r"(scale), // %2 | 2375 : "r"(scale), // %2 |
2513 "r"(interval_size), // %3 | 2376 "r"(interval_size), // %3 |
2514 "r"(interval_offset) // %4 | 2377 "r"(interval_offset) // %4 |
2515 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 2378 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
2516 ); | 2379 ); |
2517 } | 2380 } |
2518 #endif // HAS_ARGBQUANTIZEROW_NEON | |
2519 | 2381 |
2520 // Shade 8 pixels at a time by specified value. | 2382 // Shade 8 pixels at a time by specified value. |
2521 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. | 2383 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. |
2522 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. | 2384 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. |
2523 #ifdef HAS_ARGBSHADEROW_NEON | |
2524 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, | 2385 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, |
2525 uint32 value) { | 2386 uint32 value) { |
2526 asm volatile ( | 2387 asm volatile ( |
2527 "dup v0.4s, %w3 \n" // duplicate scale value. | 2388 "dup v0.4s, %w3 \n" // duplicate scale value. |
2528 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. | 2389 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. |
2529 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. | 2390 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. |
2530 | 2391 |
2531 // 8 pixel loop. | 2392 // 8 pixel loop. |
2532 "1: \n" | 2393 "1: \n" |
2533 MEMACCESS(0) | 2394 MEMACCESS(0) |
(...skipping 14 matching lines...) Expand all Loading... |
2548 MEMACCESS(1) | 2409 MEMACCESS(1) |
2549 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels | 2410 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels |
2550 "b.gt 1b \n" | 2411 "b.gt 1b \n" |
2551 : "+r"(src_argb), // %0 | 2412 : "+r"(src_argb), // %0 |
2552 "+r"(dst_argb), // %1 | 2413 "+r"(dst_argb), // %1 |
2553 "+r"(width) // %2 | 2414 "+r"(width) // %2 |
2554 : "r"(value) // %3 | 2415 : "r"(value) // %3 |
2555 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" | 2416 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" |
2556 ); | 2417 ); |
2557 } | 2418 } |
2558 #endif // HAS_ARGBSHADEROW_NEON | |
2559 | 2419 |
2560 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels | 2420 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels |
2561 // Similar to ARGBToYJ but stores ARGB. | 2421 // Similar to ARGBToYJ but stores ARGB. |
2562 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; | 2422 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; |
2563 #ifdef HAS_ARGBGRAYROW_NEON | |
2564 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 2423 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
2565 asm volatile ( | 2424 asm volatile ( |
2566 "movi v24.8b, #15 \n" // B * 0.11400 coefficient | 2425 "movi v24.8b, #15 \n" // B * 0.11400 coefficient |
2567 "movi v25.8b, #75 \n" // G * 0.58700 coefficient | 2426 "movi v25.8b, #75 \n" // G * 0.58700 coefficient |
2568 "movi v26.8b, #38 \n" // R * 0.29900 coefficient | 2427 "movi v26.8b, #38 \n" // R * 0.29900 coefficient |
2569 "1: \n" | 2428 "1: \n" |
2570 MEMACCESS(0) | 2429 MEMACCESS(0) |
2571 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2430 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2572 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2431 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2573 "umull v4.8h, v0.8b, v24.8b \n" // B | 2432 "umull v4.8h, v0.8b, v24.8b \n" // B |
2574 "umlal v4.8h, v1.8b, v25.8b \n" // G | 2433 "umlal v4.8h, v1.8b, v25.8b \n" // G |
2575 "umlal v4.8h, v2.8b, v26.8b \n" // R | 2434 "umlal v4.8h, v2.8b, v26.8b \n" // R |
2576 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B | 2435 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B |
2577 "orr v1.8b, v0.8b, v0.8b \n" // G | 2436 "orr v1.8b, v0.8b, v0.8b \n" // G |
2578 "orr v2.8b, v0.8b, v0.8b \n" // R | 2437 "orr v2.8b, v0.8b, v0.8b \n" // R |
2579 MEMACCESS(1) | 2438 MEMACCESS(1) |
2580 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. | 2439 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. |
2581 "b.gt 1b \n" | 2440 "b.gt 1b \n" |
2582 : "+r"(src_argb), // %0 | 2441 : "+r"(src_argb), // %0 |
2583 "+r"(dst_argb), // %1 | 2442 "+r"(dst_argb), // %1 |
2584 "+r"(width) // %2 | 2443 "+r"(width) // %2 |
2585 : | 2444 : |
2586 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" | 2445 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" |
2587 ); | 2446 ); |
2588 } | 2447 } |
2589 #endif // HAS_ARGBGRAYROW_NEON | |
2590 | 2448 |
2591 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 2449 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
2592 // b = (r * 35 + g * 68 + b * 17) >> 7 | 2450 // b = (r * 35 + g * 68 + b * 17) >> 7 |
2593 // g = (r * 45 + g * 88 + b * 22) >> 7 | 2451 // g = (r * 45 + g * 88 + b * 22) >> 7 |
2594 // r = (r * 50 + g * 98 + b * 24) >> 7 | 2452 // r = (r * 50 + g * 98 + b * 24) >> 7 |
2595 | 2453 |
2596 #ifdef HAS_ARGBSEPIAROW_NEON | |
2597 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { | 2454 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { |
2598 asm volatile ( | 2455 asm volatile ( |
2599 "movi v20.8b, #17 \n" // BB coefficient | 2456 "movi v20.8b, #17 \n" // BB coefficient |
2600 "movi v21.8b, #68 \n" // BG coefficient | 2457 "movi v21.8b, #68 \n" // BG coefficient |
2601 "movi v22.8b, #35 \n" // BR coefficient | 2458 "movi v22.8b, #35 \n" // BR coefficient |
2602 "movi v24.8b, #22 \n" // GB coefficient | 2459 "movi v24.8b, #22 \n" // GB coefficient |
2603 "movi v25.8b, #88 \n" // GG coefficient | 2460 "movi v25.8b, #88 \n" // GG coefficient |
2604 "movi v26.8b, #45 \n" // GR coefficient | 2461 "movi v26.8b, #45 \n" // GR coefficient |
2605 "movi v28.8b, #24 \n" // RB coefficient | 2462 "movi v28.8b, #24 \n" // RB coefficient |
2606 "movi v29.8b, #98 \n" // RG coefficient | 2463 "movi v29.8b, #98 \n" // RG coefficient |
(...skipping 17 matching lines...) Expand all Loading... |
2624 MEMACCESS(0) | 2481 MEMACCESS(0) |
2625 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. | 2482 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. |
2626 "b.gt 1b \n" | 2483 "b.gt 1b \n" |
2627 : "+r"(dst_argb), // %0 | 2484 : "+r"(dst_argb), // %0 |
2628 "+r"(width) // %1 | 2485 "+r"(width) // %1 |
2629 : | 2486 : |
2630 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 2487 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
2631 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" | 2488 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" |
2632 ); | 2489 ); |
2633 } | 2490 } |
2634 #endif // HAS_ARGBSEPIAROW_NEON | |
2635 | 2491 |
2636 // Transform 8 ARGB pixels (32 bytes) with color matrix. | 2492 // Transform 8 ARGB pixels (32 bytes) with color matrix. |
2637 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function | 2493 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function |
2638 // needs to saturate. Consider doing a non-saturating version. | 2494 // needs to saturate. Consider doing a non-saturating version. |
2639 #ifdef HAS_ARGBCOLORMATRIXROW_NEON | |
2640 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, | 2495 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, |
2641 const int8* matrix_argb, int width) { | 2496 const int8* matrix_argb, int width) { |
2642 asm volatile ( | 2497 asm volatile ( |
2643 MEMACCESS(3) | 2498 MEMACCESS(3) |
2644 "ld1 {v2.16b}, [%3] \n" // load 4 ARGB vectors. | 2499 "ld1 {v2.16b}, [%3] \n" // load 4 ARGB vectors. |
2645 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. | 2500 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. |
2646 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. | 2501 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. |
2647 | 2502 |
2648 "1: \n" | 2503 "1: \n" |
2649 MEMACCESS(0) | 2504 MEMACCESS(0) |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2689 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. | 2544 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. |
2690 "b.gt 1b \n" | 2545 "b.gt 1b \n" |
2691 : "+r"(src_argb), // %0 | 2546 : "+r"(src_argb), // %0 |
2692 "+r"(dst_argb), // %1 | 2547 "+r"(dst_argb), // %1 |
2693 "+r"(width) // %2 | 2548 "+r"(width) // %2 |
2694 : "r"(matrix_argb) // %3 | 2549 : "r"(matrix_argb) // %3 |
2695 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 2550 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
2696 "v18", "v19", "v22", "v23", "v24", "v25" | 2551 "v18", "v19", "v22", "v23", "v24", "v25" |
2697 ); | 2552 ); |
2698 } | 2553 } |
2699 #endif // HAS_ARGBCOLORMATRIXROW_NEON | |
2700 | 2554 |
2701 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. | 2555 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. |
2702 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 2556 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
2703 #ifdef HAS_ARGBMULTIPLYROW_NEON | |
2704 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2557 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2705 uint8* dst_argb, int width) { | 2558 uint8* dst_argb, int width) { |
2706 asm volatile ( | 2559 asm volatile ( |
2707 // 8 pixel loop. | 2560 // 8 pixel loop. |
2708 "1: \n" | 2561 "1: \n" |
2709 MEMACCESS(0) | 2562 MEMACCESS(0) |
2710 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2563 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2711 MEMACCESS(1) | 2564 MEMACCESS(1) |
2712 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2565 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
2713 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2566 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
(...skipping 10 matching lines...) Expand all Loading... |
2724 "b.gt 1b \n" | 2577 "b.gt 1b \n" |
2725 | 2578 |
2726 : "+r"(src_argb0), // %0 | 2579 : "+r"(src_argb0), // %0 |
2727 "+r"(src_argb1), // %1 | 2580 "+r"(src_argb1), // %1 |
2728 "+r"(dst_argb), // %2 | 2581 "+r"(dst_argb), // %2 |
2729 "+r"(width) // %3 | 2582 "+r"(width) // %3 |
2730 : | 2583 : |
2731 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2732 ); | 2585 ); |
2733 } | 2586 } |
2734 #endif // HAS_ARGBMULTIPLYROW_NEON | |
2735 | 2587 |
2736 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 2588 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
2737 #ifdef HAS_ARGBADDROW_NEON | |
2738 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2589 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2739 uint8* dst_argb, int width) { | 2590 uint8* dst_argb, int width) { |
2740 asm volatile ( | 2591 asm volatile ( |
2741 // 8 pixel loop. | 2592 // 8 pixel loop. |
2742 "1: \n" | 2593 "1: \n" |
2743 MEMACCESS(0) | 2594 MEMACCESS(0) |
2744 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2595 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2745 MEMACCESS(1) | 2596 MEMACCESS(1) |
2746 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2597 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
2747 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2598 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2748 "uqadd v0.8b, v0.8b, v4.8b \n" | 2599 "uqadd v0.8b, v0.8b, v4.8b \n" |
2749 "uqadd v1.8b, v1.8b, v5.8b \n" | 2600 "uqadd v1.8b, v1.8b, v5.8b \n" |
2750 "uqadd v2.8b, v2.8b, v6.8b \n" | 2601 "uqadd v2.8b, v2.8b, v6.8b \n" |
2751 "uqadd v3.8b, v3.8b, v7.8b \n" | 2602 "uqadd v3.8b, v3.8b, v7.8b \n" |
2752 MEMACCESS(2) | 2603 MEMACCESS(2) |
2753 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2604 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2754 "b.gt 1b \n" | 2605 "b.gt 1b \n" |
2755 | 2606 |
2756 : "+r"(src_argb0), // %0 | 2607 : "+r"(src_argb0), // %0 |
2757 "+r"(src_argb1), // %1 | 2608 "+r"(src_argb1), // %1 |
2758 "+r"(dst_argb), // %2 | 2609 "+r"(dst_argb), // %2 |
2759 "+r"(width) // %3 | 2610 "+r"(width) // %3 |
2760 : | 2611 : |
2761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2612 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2762 ); | 2613 ); |
2763 } | 2614 } |
2764 #endif // HAS_ARGBADDROW_NEON | |
2765 | 2615 |
2766 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | 2616 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
2767 #ifdef HAS_ARGBSUBTRACTROW_NEON | |
2768 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2617 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2769 uint8* dst_argb, int width) { | 2618 uint8* dst_argb, int width) { |
2770 asm volatile ( | 2619 asm volatile ( |
2771 // 8 pixel loop. | 2620 // 8 pixel loop. |
2772 "1: \n" | 2621 "1: \n" |
2773 MEMACCESS(0) | 2622 MEMACCESS(0) |
2774 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2623 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2775 MEMACCESS(1) | 2624 MEMACCESS(1) |
2776 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2625 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
2777 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2626 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2778 "uqsub v0.8b, v0.8b, v4.8b \n" | 2627 "uqsub v0.8b, v0.8b, v4.8b \n" |
2779 "uqsub v1.8b, v1.8b, v5.8b \n" | 2628 "uqsub v1.8b, v1.8b, v5.8b \n" |
2780 "uqsub v2.8b, v2.8b, v6.8b \n" | 2629 "uqsub v2.8b, v2.8b, v6.8b \n" |
2781 "uqsub v3.8b, v3.8b, v7.8b \n" | 2630 "uqsub v3.8b, v3.8b, v7.8b \n" |
2782 MEMACCESS(2) | 2631 MEMACCESS(2) |
2783 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2632 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2784 "b.gt 1b \n" | 2633 "b.gt 1b \n" |
2785 | 2634 |
2786 : "+r"(src_argb0), // %0 | 2635 : "+r"(src_argb0), // %0 |
2787 "+r"(src_argb1), // %1 | 2636 "+r"(src_argb1), // %1 |
2788 "+r"(dst_argb), // %2 | 2637 "+r"(dst_argb), // %2 |
2789 "+r"(width) // %3 | 2638 "+r"(width) // %3 |
2790 : | 2639 : |
2791 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2640 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2792 ); | 2641 ); |
2793 } | 2642 } |
2794 #endif // HAS_ARGBSUBTRACTROW_NEON | |
2795 | 2643 |
2796 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 2644 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
2797 // A = 255 | 2645 // A = 255 |
2798 // R = Sobel | 2646 // R = Sobel |
2799 // G = Sobel | 2647 // G = Sobel |
2800 // B = Sobel | 2648 // B = Sobel |
2801 #ifdef HAS_SOBELROW_NEON | |
2802 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2649 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
2803 uint8* dst_argb, int width) { | 2650 uint8* dst_argb, int width) { |
2804 asm volatile ( | 2651 asm volatile ( |
2805 "movi v3.8b, #255 \n" // alpha | 2652 "movi v3.8b, #255 \n" // alpha |
2806 // 8 pixel loop. | 2653 // 8 pixel loop. |
2807 "1: \n" | 2654 "1: \n" |
2808 MEMACCESS(0) | 2655 MEMACCESS(0) |
2809 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. | 2656 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. |
2810 MEMACCESS(1) | 2657 MEMACCESS(1) |
2811 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. | 2658 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. |
2812 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2659 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2813 "uqadd v0.8b, v0.8b, v1.8b \n" // add | 2660 "uqadd v0.8b, v0.8b, v1.8b \n" // add |
2814 "orr v1.8b, v0.8b, v0.8b \n" | 2661 "orr v1.8b, v0.8b, v0.8b \n" |
2815 "orr v2.8b, v0.8b, v0.8b \n" | 2662 "orr v2.8b, v0.8b, v0.8b \n" |
2816 MEMACCESS(2) | 2663 MEMACCESS(2) |
2817 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2664 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2818 "b.gt 1b \n" | 2665 "b.gt 1b \n" |
2819 : "+r"(src_sobelx), // %0 | 2666 : "+r"(src_sobelx), // %0 |
2820 "+r"(src_sobely), // %1 | 2667 "+r"(src_sobely), // %1 |
2821 "+r"(dst_argb), // %2 | 2668 "+r"(dst_argb), // %2 |
2822 "+r"(width) // %3 | 2669 "+r"(width) // %3 |
2823 : | 2670 : |
2824 : "cc", "memory", "v0", "v1", "v2", "v3" | 2671 : "cc", "memory", "v0", "v1", "v2", "v3" |
2825 ); | 2672 ); |
2826 } | 2673 } |
2827 #endif // HAS_SOBELROW_NEON | |
2828 | 2674 |
2829 // Adds Sobel X and Sobel Y and stores Sobel into plane. | 2675 // Adds Sobel X and Sobel Y and stores Sobel into plane. |
2830 #ifdef HAS_SOBELTOPLANEROW_NEON | |
2831 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2676 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
2832 uint8* dst_y, int width) { | 2677 uint8* dst_y, int width) { |
2833 asm volatile ( | 2678 asm volatile ( |
2834 // 16 pixel loop. | 2679 // 16 pixel loop. |
2835 "1: \n" | 2680 "1: \n" |
2836 MEMACCESS(0) | 2681 MEMACCESS(0) |
2837 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. | 2682 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. |
2838 MEMACCESS(1) | 2683 MEMACCESS(1) |
2839 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. | 2684 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. |
2840 "subs %w3, %w3, #16 \n" // 16 processed per loop. | 2685 "subs %w3, %w3, #16 \n" // 16 processed per loop. |
2841 "uqadd v0.16b, v0.16b, v1.16b \n" // add | 2686 "uqadd v0.16b, v0.16b, v1.16b \n" // add |
2842 MEMACCESS(2) | 2687 MEMACCESS(2) |
2843 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. | 2688 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. |
2844 "b.gt 1b \n" | 2689 "b.gt 1b \n" |
2845 : "+r"(src_sobelx), // %0 | 2690 : "+r"(src_sobelx), // %0 |
2846 "+r"(src_sobely), // %1 | 2691 "+r"(src_sobely), // %1 |
2847 "+r"(dst_y), // %2 | 2692 "+r"(dst_y), // %2 |
2848 "+r"(width) // %3 | 2693 "+r"(width) // %3 |
2849 : | 2694 : |
2850 : "cc", "memory", "v0", "v1" | 2695 : "cc", "memory", "v0", "v1" |
2851 ); | 2696 ); |
2852 } | 2697 } |
2853 #endif // HAS_SOBELTOPLANEROW_NEON | |
2854 | 2698 |
2855 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 2699 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
2856 // A = 255 | 2700 // A = 255 |
2857 // R = Sobel X | 2701 // R = Sobel X |
2858 // G = Sobel | 2702 // G = Sobel |
2859 // B = Sobel Y | 2703 // B = Sobel Y |
2860 #ifdef HAS_SOBELXYROW_NEON | |
2861 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2704 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
2862 uint8* dst_argb, int width) { | 2705 uint8* dst_argb, int width) { |
2863 asm volatile ( | 2706 asm volatile ( |
2864 "movi v3.8b, #255 \n" // alpha | 2707 "movi v3.8b, #255 \n" // alpha |
2865 // 8 pixel loop. | 2708 // 8 pixel loop. |
2866 "1: \n" | 2709 "1: \n" |
2867 MEMACCESS(0) | 2710 MEMACCESS(0) |
2868 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. | 2711 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. |
2869 MEMACCESS(1) | 2712 MEMACCESS(1) |
2870 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. | 2713 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. |
2871 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2714 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2872 "uqadd v1.8b, v0.8b, v2.8b \n" // add | 2715 "uqadd v1.8b, v0.8b, v2.8b \n" // add |
2873 MEMACCESS(2) | 2716 MEMACCESS(2) |
2874 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2717 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2875 "b.gt 1b \n" | 2718 "b.gt 1b \n" |
2876 : "+r"(src_sobelx), // %0 | 2719 : "+r"(src_sobelx), // %0 |
2877 "+r"(src_sobely), // %1 | 2720 "+r"(src_sobely), // %1 |
2878 "+r"(dst_argb), // %2 | 2721 "+r"(dst_argb), // %2 |
2879 "+r"(width) // %3 | 2722 "+r"(width) // %3 |
2880 : | 2723 : |
2881 : "cc", "memory", "v0", "v1", "v2", "v3" | 2724 : "cc", "memory", "v0", "v1", "v2", "v3" |
2882 ); | 2725 ); |
2883 } | 2726 } |
2884 #endif // HAS_SOBELXYROW_NEON | |
2885 | 2727 |
2886 // SobelX as a matrix is | 2728 // SobelX as a matrix is |
2887 // -1 0 1 | 2729 // -1 0 1 |
2888 // -2 0 2 | 2730 // -2 0 2 |
2889 // -1 0 1 | 2731 // -1 0 1 |
2890 #ifdef HAS_SOBELXROW_NEON | |
2891 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, | 2732 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, |
2892 const uint8* src_y2, uint8* dst_sobelx, int width) { | 2733 const uint8* src_y2, uint8* dst_sobelx, int width) { |
2893 asm volatile ( | 2734 asm volatile ( |
2894 "1: \n" | 2735 "1: \n" |
2895 MEMACCESS(0) | 2736 MEMACCESS(0) |
2896 "ld1 {v0.8b}, [%0],%5 \n" // top | 2737 "ld1 {v0.8b}, [%0],%5 \n" // top |
2897 MEMACCESS(0) | 2738 MEMACCESS(0) |
2898 "ld1 {v1.8b}, [%0],%6 \n" | 2739 "ld1 {v1.8b}, [%0],%6 \n" |
2899 "usubl v0.8h, v0.8b, v1.8b \n" | 2740 "usubl v0.8h, v0.8b, v1.8b \n" |
2900 MEMACCESS(1) | 2741 MEMACCESS(1) |
(...skipping 18 matching lines...) Expand all Loading... |
2919 : "+r"(src_y0), // %0 | 2760 : "+r"(src_y0), // %0 |
2920 "+r"(src_y1), // %1 | 2761 "+r"(src_y1), // %1 |
2921 "+r"(src_y2), // %2 | 2762 "+r"(src_y2), // %2 |
2922 "+r"(dst_sobelx), // %3 | 2763 "+r"(dst_sobelx), // %3 |
2923 "+r"(width) // %4 | 2764 "+r"(width) // %4 |
2924 : "r"(2LL), // %5 | 2765 : "r"(2LL), // %5 |
2925 "r"(6LL) // %6 | 2766 "r"(6LL) // %6 |
2926 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2767 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
2927 ); | 2768 ); |
2928 } | 2769 } |
2929 #endif // HAS_SOBELXROW_NEON | |
2930 | 2770 |
2931 // SobelY as a matrix is | 2771 // SobelY as a matrix is |
2932 // -1 -2 -1 | 2772 // -1 -2 -1 |
2933 // 0 0 0 | 2773 // 0 0 0 |
2934 // 1 2 1 | 2774 // 1 2 1 |
2935 #ifdef HAS_SOBELYROW_NEON | |
2936 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, | 2775 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, |
2937 uint8* dst_sobely, int width) { | 2776 uint8* dst_sobely, int width) { |
2938 asm volatile ( | 2777 asm volatile ( |
2939 "1: \n" | 2778 "1: \n" |
2940 MEMACCESS(0) | 2779 MEMACCESS(0) |
2941 "ld1 {v0.8b}, [%0],%4 \n" // left | 2780 "ld1 {v0.8b}, [%0],%4 \n" // left |
2942 MEMACCESS(1) | 2781 MEMACCESS(1) |
2943 "ld1 {v1.8b}, [%1],%4 \n" | 2782 "ld1 {v1.8b}, [%1],%4 \n" |
2944 "usubl v0.8h, v0.8b, v1.8b \n" | 2783 "usubl v0.8h, v0.8b, v1.8b \n" |
2945 MEMACCESS(0) | 2784 MEMACCESS(0) |
(...skipping 17 matching lines...) Expand all Loading... |
2963 "b.gt 1b \n" | 2802 "b.gt 1b \n" |
2964 : "+r"(src_y0), // %0 | 2803 : "+r"(src_y0), // %0 |
2965 "+r"(src_y1), // %1 | 2804 "+r"(src_y1), // %1 |
2966 "+r"(dst_sobely), // %2 | 2805 "+r"(dst_sobely), // %2 |
2967 "+r"(width) // %3 | 2806 "+r"(width) // %3 |
2968 : "r"(1LL), // %4 | 2807 : "r"(1LL), // %4 |
2969 "r"(6LL) // %5 | 2808 "r"(6LL) // %5 |
2970 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
2971 ); | 2810 ); |
2972 } | 2811 } |
2973 #endif // HAS_SOBELYROW_NEON | |
2974 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
2975 | 2813 |
2976 #ifdef __cplusplus | 2814 #ifdef __cplusplus |
2977 } // extern "C" | 2815 } // extern "C" |
2978 } // namespace libyuv | 2816 } // namespace libyuv |
2979 #endif | 2817 #endif |
OLD | NEW |