OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 160 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Converts one row taken from the src_y / src_u / src_v planes into 4-byte
// pixels written to dst_argb, 8 pixels per loop pass.
// Per pass: READYUV444 and YUV422TORGB(v22, v21, v20) (macros defined outside
// this view; presumably load the samples and produce R/G/B in v22/v21/v20 --
// confirm) run first, v23 is filled with 255 for alpha, then st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm
// against callers.
// Diff note: NEW writes the counter as %w4, the 32-bit W view of the register
// holding the int width; OLD used %4, the full 64-bit X register.
171 void I444ToARGBRow_NEON(const uint8* src_y, | 171 void I444ToARGBRow_NEON(const uint8* src_y, |
172 const uint8* src_u, | 172 const uint8* src_u, |
173 const uint8* src_v, | 173 const uint8* src_v, |
174 uint8* dst_argb, | 174 uint8* dst_argb, |
175 int width) { | 175 int width) { |
176 asm volatile ( | 176 asm volatile ( |
177 YUV422TORGB_SETUP_REG | 177 YUV422TORGB_SETUP_REG |
178 "1: \n" | 178 "1: \n" |
179 READYUV444 | 179 READYUV444 |
180 YUV422TORGB(v22, v21, v20) | 180 YUV422TORGB(v22, v21, v20) |
181 "subs %4, %4, #8 \n" | 181 "subs %w4, %w4, #8 \n" |
182 "movi v23.8b, #255 \n" /* A */ | 182 "movi v23.8b, #255 \n" /* A */ |
183 MEMACCESS(3) | 183 MEMACCESS(3) |
184 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 184 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
185 "b.gt 1b \n" | 185 "b.gt 1b \n" |
186 : "+r"(src_y), // %0 | 186 : "+r"(src_y), // %0 |
187 "+r"(src_u), // %1 | 187 "+r"(src_u), // %1 |
188 "+r"(src_v), // %2 | 188 "+r"(src_v), // %2 |
189 "+r"(dst_argb), // %3 | 189 "+r"(dst_argb), // %3 |
190 "+r"(width) // %4 | 190 "+r"(width) // %4 |
191 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 191 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
192 [kYToRgb]"r"(&kYToRgb) | 192 [kYToRgb]"r"(&kYToRgb) |
193 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 193 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
194 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 194 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
195 ); | 195 ); |
196 } | 196 } |
197 #endif // HAS_I444TOARGBROW_NEON | 197 #endif // HAS_I444TOARGBROW_NEON |
198 | 198 |
199 #ifdef HAS_I422TOARGBROW_NEON | 199 #ifdef HAS_I422TOARGBROW_NEON |
// Converts one row taken from the src_y / src_u / src_v planes into 4-byte
// pixels written to dst_argb, 8 pixels per loop pass.
// Per pass: READYUV422 and YUV422TORGB(v22, v21, v20) (macros defined outside
// this view; presumably load the samples and produce R/G/B in v22/v21/v20 --
// confirm) run first, v23 is filled with 255 for alpha, then st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w4 (32-bit W register view of the int
// width operand); OLD used %4 (64-bit X register).
200 void I422ToARGBRow_NEON(const uint8* src_y, | 200 void I422ToARGBRow_NEON(const uint8* src_y, |
201 const uint8* src_u, | 201 const uint8* src_u, |
202 const uint8* src_v, | 202 const uint8* src_v, |
203 uint8* dst_argb, | 203 uint8* dst_argb, |
204 int width) { | 204 int width) { |
205 asm volatile ( | 205 asm volatile ( |
206 YUV422TORGB_SETUP_REG | 206 YUV422TORGB_SETUP_REG |
207 "1: \n" | 207 "1: \n" |
208 READYUV422 | 208 READYUV422 |
209 YUV422TORGB(v22, v21, v20) | 209 YUV422TORGB(v22, v21, v20) |
210 "subs %4, %4, #8 \n" | 210 "subs %w4, %w4, #8 \n" |
211 "movi v23.8b, #255 \n" /* A */ | 211 "movi v23.8b, #255 \n" /* A */ |
212 MEMACCESS(3) | 212 MEMACCESS(3) |
213 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 213 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
214 "b.gt 1b \n" | 214 "b.gt 1b \n" |
215 : "+r"(src_y), // %0 | 215 : "+r"(src_y), // %0 |
216 "+r"(src_u), // %1 | 216 "+r"(src_u), // %1 |
217 "+r"(src_v), // %2 | 217 "+r"(src_v), // %2 |
218 "+r"(dst_argb), // %3 | 218 "+r"(dst_argb), // %3 |
219 "+r"(width) // %4 | 219 "+r"(width) // %4 |
220 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 220 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
221 [kYToRgb]"r"(&kYToRgb) | 221 [kYToRgb]"r"(&kYToRgb) |
222 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 222 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
223 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 223 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
224 ); | 224 ); |
225 } | 225 } |
226 #endif // HAS_I422TOARGBROW_NEON | 226 #endif // HAS_I422TOARGBROW_NEON |
227 | 227 |
228 #ifdef HAS_I411TOARGBROW_NEON | 228 #ifdef HAS_I411TOARGBROW_NEON |
// Converts one row taken from the src_y / src_u / src_v planes into 4-byte
// pixels written to dst_argb, 8 pixels per loop pass.
// Identical in shape to the other planar-to-ARGB rows in this file except that
// the samples are fetched with READYUV411 (macro defined outside this view).
// After YUV422TORGB(v22, v21, v20), v23 is filled with 255 for alpha and st4
// interleaves v20, v21, v22, v23 (in that byte order) into 32 output bytes.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w4 (32-bit W register view of the int
// width operand); OLD used %4 (64-bit X register).
229 void I411ToARGBRow_NEON(const uint8* src_y, | 229 void I411ToARGBRow_NEON(const uint8* src_y, |
230 const uint8* src_u, | 230 const uint8* src_u, |
231 const uint8* src_v, | 231 const uint8* src_v, |
232 uint8* dst_argb, | 232 uint8* dst_argb, |
233 int width) { | 233 int width) { |
234 asm volatile ( | 234 asm volatile ( |
235 YUV422TORGB_SETUP_REG | 235 YUV422TORGB_SETUP_REG |
236 "1: \n" | 236 "1: \n" |
237 READYUV411 | 237 READYUV411 |
238 YUV422TORGB(v22, v21, v20) | 238 YUV422TORGB(v22, v21, v20) |
239 "subs %4, %4, #8 \n" | 239 "subs %w4, %w4, #8 \n" |
240 "movi v23.8b, #255 \n" /* A */ | 240 "movi v23.8b, #255 \n" /* A */ |
241 MEMACCESS(3) | 241 MEMACCESS(3) |
242 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 242 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
243 "b.gt 1b \n" | 243 "b.gt 1b \n" |
244 : "+r"(src_y), // %0 | 244 : "+r"(src_y), // %0 |
245 "+r"(src_u), // %1 | 245 "+r"(src_u), // %1 |
246 "+r"(src_v), // %2 | 246 "+r"(src_v), // %2 |
247 "+r"(dst_argb), // %3 | 247 "+r"(dst_argb), // %3 |
248 "+r"(width) // %4 | 248 "+r"(width) // %4 |
249 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 249 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
250 [kYToRgb]"r"(&kYToRgb) | 250 [kYToRgb]"r"(&kYToRgb) |
251 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 251 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
252 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 252 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
253 ); | 253 ); |
254 } | 254 } |
255 #endif // HAS_I411TOARGBROW_NEON | 255 #endif // HAS_I411TOARGBROW_NEON |
256 | 256 |
257 #ifdef HAS_I422TOBGRAROW_NEON | 257 #ifdef HAS_I422TOBGRAROW_NEON |
// Converts one row taken from the src_y / src_u / src_v planes into 4-byte
// pixels written to dst_bgra, 8 pixels per loop pass.
// Differs from the ARGB variant only in register assignment: the colour
// outputs go to YUV422TORGB(v21, v22, v23) and the 255 alpha fill goes to v20,
// so st4 emits the alpha byte first, followed by v21, v22, v23.
// (The meaning of the three macro arguments is defined outside this view --
// presumably R, G, B; confirm.)
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w4 (32-bit W register view of the int
// width operand); OLD used %4 (64-bit X register).
258 void I422ToBGRARow_NEON(const uint8* src_y, | 258 void I422ToBGRARow_NEON(const uint8* src_y, |
259 const uint8* src_u, | 259 const uint8* src_u, |
260 const uint8* src_v, | 260 const uint8* src_v, |
261 uint8* dst_bgra, | 261 uint8* dst_bgra, |
262 int width) { | 262 int width) { |
263 asm volatile ( | 263 asm volatile ( |
264 YUV422TORGB_SETUP_REG | 264 YUV422TORGB_SETUP_REG |
265 "1: \n" | 265 "1: \n" |
266 READYUV422 | 266 READYUV422 |
267 YUV422TORGB(v21, v22, v23) | 267 YUV422TORGB(v21, v22, v23) |
268 "subs %4, %4, #8 \n" | 268 "subs %w4, %w4, #8 \n" |
269 "movi v20.8b, #255 \n" /* A */ | 269 "movi v20.8b, #255 \n" /* A */ |
270 MEMACCESS(3) | 270 MEMACCESS(3) |
271 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 271 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
272 "b.gt 1b \n" | 272 "b.gt 1b \n" |
273 : "+r"(src_y), // %0 | 273 : "+r"(src_y), // %0 |
274 "+r"(src_u), // %1 | 274 "+r"(src_u), // %1 |
275 "+r"(src_v), // %2 | 275 "+r"(src_v), // %2 |
276 "+r"(dst_bgra), // %3 | 276 "+r"(dst_bgra), // %3 |
277 "+r"(width) // %4 | 277 "+r"(width) // %4 |
278 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 278 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
279 [kYToRgb]"r"(&kYToRgb) | 279 [kYToRgb]"r"(&kYToRgb) |
280 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 280 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
281 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 281 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
282 ); | 282 ); |
283 } | 283 } |
284 #endif // HAS_I422TOBGRAROW_NEON | 284 #endif // HAS_I422TOBGRAROW_NEON |
285 | 285 |
286 #ifdef HAS_I422TOABGRROW_NEON | 286 #ifdef HAS_I422TOABGRROW_NEON |
// Converts one row taken from the src_y / src_u / src_v planes into 4-byte
// pixels written to dst_abgr, 8 pixels per loop pass.
// Differs from the ARGB variant only in the macro argument order:
// YUV422TORGB(v20, v21, v22) reverses the colour registers, while the 255
// alpha fill stays in v23. st4 then emits v20, v21, v22, v23 in that order.
// (The meaning of the three macro arguments is defined outside this view --
// presumably R, G, B; confirm.)
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w4 (32-bit W register view of the int
// width operand); OLD used %4 (64-bit X register).
287 void I422ToABGRRow_NEON(const uint8* src_y, | 287 void I422ToABGRRow_NEON(const uint8* src_y, |
288 const uint8* src_u, | 288 const uint8* src_u, |
289 const uint8* src_v, | 289 const uint8* src_v, |
290 uint8* dst_abgr, | 290 uint8* dst_abgr, |
291 int width) { | 291 int width) { |
292 asm volatile ( | 292 asm volatile ( |
293 YUV422TORGB_SETUP_REG | 293 YUV422TORGB_SETUP_REG |
294 "1: \n" | 294 "1: \n" |
295 READYUV422 | 295 READYUV422 |
296 YUV422TORGB(v20, v21, v22) | 296 YUV422TORGB(v20, v21, v22) |
297 "subs %4, %4, #8 \n" | 297 "subs %w4, %w4, #8 \n" |
298 "movi v23.8b, #255 \n" /* A */ | 298 "movi v23.8b, #255 \n" /* A */ |
299 MEMACCESS(3) | 299 MEMACCESS(3) |
300 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 300 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
301 "b.gt 1b \n" | 301 "b.gt 1b \n" |
302 : "+r"(src_y), // %0 | 302 : "+r"(src_y), // %0 |
303 "+r"(src_u), // %1 | 303 "+r"(src_u), // %1 |
304 "+r"(src_v), // %2 | 304 "+r"(src_v), // %2 |
305 "+r"(dst_abgr), // %3 | 305 "+r"(dst_abgr), // %3 |
306 "+r"(width) // %4 | 306 "+r"(width) // %4 |
307 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 307 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
308 [kYToRgb]"r"(&kYToRgb) | 308 [kYToRgb]"r"(&kYToRgb) |
309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
310 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 310 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
311 ); | 311 ); |
312 } | 312 } |
313 #endif // HAS_I422TOABGRROW_NEON | 313 #endif // HAS_I422TOABGRROW_NEON |
314 | 314 |
315 #ifdef HAS_I422TORGBAROW_NEON | 315 #ifdef HAS_I422TORGBAROW_NEON |
// Converts one row taken from the src_y / src_u / src_v planes into 4-byte
// pixels written to dst_rgba, 8 pixels per loop pass.
// Register assignment: colour outputs go to YUV422TORGB(v23, v22, v21) and the
// 255 alpha fill goes to v20, so st4 emits the alpha byte first, followed by
// v21, v22, v23.
// (The meaning of the three macro arguments is defined outside this view --
// presumably R, G, B; confirm.)
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w4 (32-bit W register view of the int
// width operand); OLD used %4 (64-bit X register).
316 void I422ToRGBARow_NEON(const uint8* src_y, | 316 void I422ToRGBARow_NEON(const uint8* src_y, |
317 const uint8* src_u, | 317 const uint8* src_u, |
318 const uint8* src_v, | 318 const uint8* src_v, |
319 uint8* dst_rgba, | 319 uint8* dst_rgba, |
320 int width) { | 320 int width) { |
321 asm volatile ( | 321 asm volatile ( |
322 YUV422TORGB_SETUP_REG | 322 YUV422TORGB_SETUP_REG |
323 "1: \n" | 323 "1: \n" |
324 READYUV422 | 324 READYUV422 |
325 YUV422TORGB(v23, v22, v21) | 325 YUV422TORGB(v23, v22, v21) |
326 "subs %4, %4, #8 \n" | 326 "subs %w4, %w4, #8 \n" |
327 "movi v20.8b, #255 \n" /* A */ | 327 "movi v20.8b, #255 \n" /* A */ |
328 MEMACCESS(3) | 328 MEMACCESS(3) |
329 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 329 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
330 "b.gt 1b \n" | 330 "b.gt 1b \n" |
331 : "+r"(src_y), // %0 | 331 : "+r"(src_y), // %0 |
332 "+r"(src_u), // %1 | 332 "+r"(src_u), // %1 |
333 "+r"(src_v), // %2 | 333 "+r"(src_v), // %2 |
334 "+r"(dst_rgba), // %3 | 334 "+r"(dst_rgba), // %3 |
335 "+r"(width) // %4 | 335 "+r"(width) // %4 |
336 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 336 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
337 [kYToRgb]"r"(&kYToRgb) | 337 [kYToRgb]"r"(&kYToRgb) |
338 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 338 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
339 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 339 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
340 ); | 340 ); |
341 } | 341 } |
342 #endif // HAS_I422TORGBAROW_NEON | 342 #endif // HAS_I422TORGBAROW_NEON |
343 | 343 |
344 #ifdef HAS_I422TORGB24ROW_NEON | 344 #ifdef HAS_I422TORGB24ROW_NEON |
// Converts one row taken from the src_y / src_u / src_v planes into 3-byte
// pixels written to dst_rgb24, 8 pixels per loop pass.
// No alpha register is prepared here: after YUV422TORGB(v22, v21, v20) the
// three colour vectors are interleaved with st3 as v20, v21, v22, producing
// 24 output bytes per pass. v23 is still listed as clobbered even though this
// asm text does not write it directly.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w4 (32-bit W register view of the int
// width operand); OLD used %4 (64-bit X register).
345 void I422ToRGB24Row_NEON(const uint8* src_y, | 345 void I422ToRGB24Row_NEON(const uint8* src_y, |
346 const uint8* src_u, | 346 const uint8* src_u, |
347 const uint8* src_v, | 347 const uint8* src_v, |
348 uint8* dst_rgb24, | 348 uint8* dst_rgb24, |
349 int width) { | 349 int width) { |
350 asm volatile ( | 350 asm volatile ( |
351 YUV422TORGB_SETUP_REG | 351 YUV422TORGB_SETUP_REG |
352 "1: \n" | 352 "1: \n" |
353 READYUV422 | 353 READYUV422 |
354 YUV422TORGB(v22, v21, v20) | 354 YUV422TORGB(v22, v21, v20) |
355 "subs %4, %4, #8 \n" | 355 "subs %w4, %w4, #8 \n" |
356 MEMACCESS(3) | 356 MEMACCESS(3) |
357 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" | 357 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" |
358 "b.gt 1b \n" | 358 "b.gt 1b \n" |
359 : "+r"(src_y), // %0 | 359 : "+r"(src_y), // %0 |
360 "+r"(src_u), // %1 | 360 "+r"(src_u), // %1 |
361 "+r"(src_v), // %2 | 361 "+r"(src_v), // %2 |
362 "+r"(dst_rgb24), // %3 | 362 "+r"(dst_rgb24), // %3 |
363 "+r"(width) // %4 | 363 "+r"(width) // %4 |
364 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 364 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
365 [kYToRgb]"r"(&kYToRgb) | 365 [kYToRgb]"r"(&kYToRgb) |
366 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 366 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
367 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 367 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
368 ); | 368 ); |
369 } | 369 } |
370 #endif // HAS_I422TORGB24ROW_NEON | 370 #endif // HAS_I422TORGB24ROW_NEON |
371 | 371 |
372 #ifdef HAS_I422TORAWROW_NEON | 372 #ifdef HAS_I422TORAWROW_NEON |
373 void I422ToRAWRow_NEON(const uint8* src_y, | 373 void I422ToRAWRow_NEON(const uint8* src_y, |
374 const uint8* src_u, | 374 const uint8* src_u, |
375 const uint8* src_v, | 375 const uint8* src_v, |
376 uint8* dst_raw, | 376 uint8* dst_raw, |
377 int width) { | 377 int width) { |
378 asm volatile ( | 378 asm volatile ( |
379 YUV422TORGB_SETUP_REG | 379 YUV422TORGB_SETUP_REG |
380 "1: \n" | 380 "1: \n" |
381 READYUV422 | 381 READYUV422 |
382 YUV422TORGB(v20, v21, v22) | 382 YUV422TORGB(v20, v21, v22) |
383 "subs %4, %4, #8 \n" | 383 "subs %w4, %w4, #8 \n" |
384 MEMACCESS(3) | 384 MEMACCESS(3) |
385 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" | 385 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" |
386 "b.gt 1b \n" | 386 "b.gt 1b \n" |
387 : "+r"(src_y), // %0 | 387 : "+r"(src_y), // %0 |
388 "+r"(src_u), // %1 | 388 "+r"(src_u), // %1 |
389 "+r"(src_v), // %2 | 389 "+r"(src_v), // %2 |
390 "+r"(dst_raw), // %3 | 390 "+r"(dst_raw), // %3 |
391 "+r"(width) // %4 | 391 "+r"(width) // %4 |
392 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 392 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
393 [kYToRgb]"r"(&kYToRgb) | 393 [kYToRgb]"r"(&kYToRgb) |
(...skipping 14 matching lines...) Expand all Loading... |
408 void I422ToRGB565Row_NEON(const uint8* src_y, | 408 void I422ToRGB565Row_NEON(const uint8* src_y, |
409 const uint8* src_u, | 409 const uint8* src_u, |
410 const uint8* src_v, | 410 const uint8* src_v, |
411 uint8* dst_rgb565, | 411 uint8* dst_rgb565, |
412 int width) { | 412 int width) { |
413 asm volatile ( | 413 asm volatile ( |
414 YUV422TORGB_SETUP_REG | 414 YUV422TORGB_SETUP_REG |
415 "1: \n" | 415 "1: \n" |
416 READYUV422 | 416 READYUV422 |
417 YUV422TORGB(v22, v21, v20) | 417 YUV422TORGB(v22, v21, v20) |
418 "subs %4, %4, #8 \n" | 418 "subs %w4, %w4, #8 \n" |
419 ARGBTORGB565 | 419 ARGBTORGB565 |
420 MEMACCESS(3) | 420 MEMACCESS(3) |
421 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. | 421 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. |
422 "b.gt 1b \n" | 422 "b.gt 1b \n" |
423 : "+r"(src_y), // %0 | 423 : "+r"(src_y), // %0 |
424 "+r"(src_u), // %1 | 424 "+r"(src_u), // %1 |
425 "+r"(src_v), // %2 | 425 "+r"(src_v), // %2 |
426 "+r"(dst_rgb565), // %3 | 426 "+r"(dst_rgb565), // %3 |
427 "+r"(width) // %4 | 427 "+r"(width) // %4 |
428 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 428 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
(...skipping 17 matching lines...) Expand all Loading... |
446 void I422ToARGB1555Row_NEON(const uint8* src_y, | 446 void I422ToARGB1555Row_NEON(const uint8* src_y, |
447 const uint8* src_u, | 447 const uint8* src_u, |
448 const uint8* src_v, | 448 const uint8* src_v, |
449 uint8* dst_argb1555, | 449 uint8* dst_argb1555, |
450 int width) { | 450 int width) { |
451 asm volatile ( | 451 asm volatile ( |
452 YUV422TORGB_SETUP_REG | 452 YUV422TORGB_SETUP_REG |
453 "1: \n" | 453 "1: \n" |
454 READYUV422 | 454 READYUV422 |
455 YUV422TORGB(v22, v21, v20) | 455 YUV422TORGB(v22, v21, v20) |
456 "subs %4, %4, #8 \n" | 456 "subs %w4, %w4, #8 \n" |
457 "movi v23.8b, #255 \n" | 457 "movi v23.8b, #255 \n" |
458 ARGBTOARGB1555 | 458 ARGBTOARGB1555 |
459 MEMACCESS(3) | 459 MEMACCESS(3) |
460 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. | 460 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. |
461 "b.gt 1b \n" | 461 "b.gt 1b \n" |
462 : "+r"(src_y), // %0 | 462 : "+r"(src_y), // %0 |
463 "+r"(src_u), // %1 | 463 "+r"(src_u), // %1 |
464 "+r"(src_v), // %2 | 464 "+r"(src_v), // %2 |
465 "+r"(dst_argb1555), // %3 | 465 "+r"(dst_argb1555), // %3 |
466 "+r"(width) // %4 | 466 "+r"(width) // %4 |
(...skipping 20 matching lines...) Expand all Loading... |
487 const uint8* src_u, | 487 const uint8* src_u, |
488 const uint8* src_v, | 488 const uint8* src_v, |
489 uint8* dst_argb4444, | 489 uint8* dst_argb4444, |
490 int width) { | 490 int width) { |
491 asm volatile ( | 491 asm volatile ( |
492 YUV422TORGB_SETUP_REG | 492 YUV422TORGB_SETUP_REG |
493 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 493 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
494 "1: \n" | 494 "1: \n" |
495 READYUV422 | 495 READYUV422 |
496 YUV422TORGB(v22, v21, v20) | 496 YUV422TORGB(v22, v21, v20) |
497 "subs %4, %4, #8 \n" | 497 "subs %w4, %w4, #8 \n" |
498 "movi v23.8b, #255 \n" | 498 "movi v23.8b, #255 \n" |
499 ARGBTOARGB4444 | 499 ARGBTOARGB4444 |
500 MEMACCESS(3) | 500 MEMACCESS(3) |
501 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. | 501 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. |
502 "b.gt 1b \n" | 502 "b.gt 1b \n" |
503 : "+r"(src_y), // %0 | 503 : "+r"(src_y), // %0 |
504 "+r"(src_u), // %1 | 504 "+r"(src_u), // %1 |
505 "+r"(src_v), // %2 | 505 "+r"(src_v), // %2 |
506 "+r"(dst_argb4444), // %3 | 506 "+r"(dst_argb4444), // %3 |
507 "+r"(width) // %4 | 507 "+r"(width) // %4 |
508 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 508 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
509 [kYToRgb]"r"(&kYToRgb) | 509 [kYToRgb]"r"(&kYToRgb) |
510 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 510 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
511 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 511 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
512 ); | 512 ); |
513 } | 513 } |
514 #endif // HAS_I422TOARGB4444ROW_NEON | 514 #endif // HAS_I422TOARGB4444ROW_NEON |
515 | 515 |
516 #ifdef HAS_YTOARGBROW_NEON | 516 #ifdef HAS_I400TOARGBROW_NEON |
// Converts one row from the single src_y plane into 4-byte pixels written to
// dst_argb, 8 pixels per loop pass. (Renamed by this diff: OLD name is
// YToARGBRow_NEON, NEW name is I400ToARGBRow_NEON.)
// Per pass: READYUV400 and YUV422TORGB(v22, v21, v20) (macros defined outside
// this view) run first, v23 is filled with 255, then st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW copies width into an int64 local (width64) and binds that to
// operand %2, while the subs instruction addresses it through %w2, the 32-bit
// W view of the same register; OLD bound the int width directly and used %2.
517 void YToARGBRow_NEON(const uint8* src_y, | 517 void I400ToARGBRow_NEON(const uint8* src_y, |
518 uint8* dst_argb, | 518 uint8* dst_argb, |
519 int width) { | 519 int width) { |
| 520 int64 width64 = (int64)(width); |
520 asm volatile ( | 521 asm volatile ( |
521 YUV422TORGB_SETUP_REG | 522 YUV422TORGB_SETUP_REG |
522 "1: \n" | 523 "1: \n" |
523 READYUV400 | 524 READYUV400 |
524 YUV422TORGB(v22, v21, v20) | 525 YUV422TORGB(v22, v21, v20) |
525 "subs %2, %2, #8 \n" | 526 "subs %w2, %w2, #8 \n" |
526 "movi v23.8b, #255 \n" | 527 "movi v23.8b, #255 \n" |
527 MEMACCESS(1) | 528 MEMACCESS(1) |
528 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 529 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
529 "b.gt 1b \n" | 530 "b.gt 1b \n" |
530 : "+r"(src_y), // %0 | 531 : "+r"(src_y), // %0 |
531 "+r"(dst_argb), // %1 | 532 "+r"(dst_argb), // %1 |
532 "+r"(width) // %2 | 533 "+r"(width64) // %2 |
533 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 534 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
534 [kYToRgb]"r"(&kYToRgb) | 535 [kYToRgb]"r"(&kYToRgb) |
535 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 536 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
536 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 537 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
537 ); | 538 ); |
538 } | 539 } |
539 #endif // HAS_YTOARGBROW_NEON | 540 #endif // HAS_I400TOARGBROW_NEON |
540 | 541 |
541 #ifdef HAS_I400TOARGBROW_NEON | 542 #ifdef HAS_J400TOARGBROW_NEON |
// Expands one row of single-byte samples from src_y into 4-byte pixels at
// dst_argb without any colour conversion. (Renamed by this diff: OLD name is
// I400ToARGBRow_NEON, NEW name is J400ToARGBRow_NEON.)
// v23 is filled with 255 once, before the loop. Each pass loads 8 source bytes
// into v20, duplicates them into v21 and v22 (orr of a register with itself is
// a plain copy), and st4 interleaves v20, v21, v22, v23 so every source byte
// becomes three identical bytes followed by 255 -- 32 output bytes per pass.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w2 (32-bit W register view of the int
// width operand); OLD used %2 (64-bit X register).
542 void I400ToARGBRow_NEON(const uint8* src_y, | 543 void J400ToARGBRow_NEON(const uint8* src_y, |
543 uint8* dst_argb, | 544 uint8* dst_argb, |
544 int width) { | 545 int width) { |
545 asm volatile ( | 546 asm volatile ( |
546 "movi v23.8b, #255 \n" | 547 "movi v23.8b, #255 \n" |
547 "1: \n" | 548 "1: \n" |
548 MEMACCESS(0) | 549 MEMACCESS(0) |
549 "ld1 {v20.8b}, [%0], #8 \n" | 550 "ld1 {v20.8b}, [%0], #8 \n" |
550 "orr v21.8b, v20.8b, v20.8b \n" | 551 "orr v21.8b, v20.8b, v20.8b \n" |
551 "orr v22.8b, v20.8b, v20.8b \n" | 552 "orr v22.8b, v20.8b, v20.8b \n" |
552 "subs %2, %2, #8 \n" | 553 "subs %w2, %w2, #8 \n" |
553 MEMACCESS(1) | 554 MEMACCESS(1) |
554 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 555 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
555 "b.gt 1b \n" | 556 "b.gt 1b \n" |
556 : "+r"(src_y), // %0 | 557 : "+r"(src_y), // %0 |
557 "+r"(dst_argb), // %1 | 558 "+r"(dst_argb), // %1 |
558 "+r"(width) // %2 | 559 "+r"(width) // %2 |
559 : | 560 : |
560 : "cc", "memory", "v20", "v21", "v22", "v23" | 561 : "cc", "memory", "v20", "v21", "v22", "v23" |
561 ); | 562 ); |
562 } | 563 } |
563 #endif // HAS_I400TOARGBROW_NEON | 564 #endif // HAS_J400TOARGBROW_NEON |
564 | 565 |
565 #ifdef HAS_NV12TOARGBROW_NEON | 566 #ifdef HAS_NV12TOARGBROW_NEON |
// Converts one row from a src_y plane plus a single combined src_uv plane into
// 4-byte pixels written to dst_argb, 8 pixels per loop pass.
// Per pass: READNV12 and YUV422TORGB(v22, v21, v20) (macros defined outside
// this view) run first, v23 is filled with 255, then st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w3 (32-bit W register view of the int
// width operand); OLD used %3 (64-bit X register).
566 void NV12ToARGBRow_NEON(const uint8* src_y, | 567 void NV12ToARGBRow_NEON(const uint8* src_y, |
567 const uint8* src_uv, | 568 const uint8* src_uv, |
568 uint8* dst_argb, | 569 uint8* dst_argb, |
569 int width) { | 570 int width) { |
570 asm volatile ( | 571 asm volatile ( |
571 YUV422TORGB_SETUP_REG | 572 YUV422TORGB_SETUP_REG |
572 "1: \n" | 573 "1: \n" |
573 READNV12 | 574 READNV12 |
574 YUV422TORGB(v22, v21, v20) | 575 YUV422TORGB(v22, v21, v20) |
575 "subs %3, %3, #8 \n" | 576 "subs %w3, %w3, #8 \n" |
576 "movi v23.8b, #255 \n" | 577 "movi v23.8b, #255 \n" |
577 MEMACCESS(2) | 578 MEMACCESS(2) |
578 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" | 579 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" |
579 "b.gt 1b \n" | 580 "b.gt 1b \n" |
580 : "+r"(src_y), // %0 | 581 : "+r"(src_y), // %0 |
581 "+r"(src_uv), // %1 | 582 "+r"(src_uv), // %1 |
582 "+r"(dst_argb), // %2 | 583 "+r"(dst_argb), // %2 |
583 "+r"(width) // %3 | 584 "+r"(width) // %3 |
584 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 585 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
585 [kYToRgb]"r"(&kYToRgb) | 586 [kYToRgb]"r"(&kYToRgb) |
586 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 587 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
587 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 588 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
588 ); | 589 ); |
589 } | 590 } |
590 #endif // HAS_NV12TOARGBROW_NEON | 591 #endif // HAS_NV12TOARGBROW_NEON |
591 | 592 |
592 #ifdef HAS_NV21TOARGBROW_NEON | 593 #ifdef HAS_NV21TOARGBROW_NEON |
// Converts one row from a src_y plane plus a single combined src_uv plane into
// 4-byte pixels written to dst_argb, 8 pixels per loop pass.
// Same structure as the NV12 variant; the only visible difference is that the
// samples are fetched with READNV21 instead of READNV12 (both macros are
// defined outside this view).
// After YUV422TORGB(v22, v21, v20), v23 is filled with 255 and st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w3 (32-bit W register view of the int
// width operand); OLD used %3 (64-bit X register).
593 void NV21ToARGBRow_NEON(const uint8* src_y, | 594 void NV21ToARGBRow_NEON(const uint8* src_y, |
594 const uint8* src_uv, | 595 const uint8* src_uv, |
595 uint8* dst_argb, | 596 uint8* dst_argb, |
596 int width) { | 597 int width) { |
597 asm volatile ( | 598 asm volatile ( |
598 YUV422TORGB_SETUP_REG | 599 YUV422TORGB_SETUP_REG |
599 "1: \n" | 600 "1: \n" |
600 READNV21 | 601 READNV21 |
601 YUV422TORGB(v22, v21, v20) | 602 YUV422TORGB(v22, v21, v20) |
602 "subs %3, %3, #8 \n" | 603 "subs %w3, %w3, #8 \n" |
603 "movi v23.8b, #255 \n" | 604 "movi v23.8b, #255 \n" |
604 MEMACCESS(2) | 605 MEMACCESS(2) |
605 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" | 606 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" |
606 "b.gt 1b \n" | 607 "b.gt 1b \n" |
607 : "+r"(src_y), // %0 | 608 : "+r"(src_y), // %0 |
608 "+r"(src_uv), // %1 | 609 "+r"(src_uv), // %1 |
609 "+r"(dst_argb), // %2 | 610 "+r"(dst_argb), // %2 |
610 "+r"(width) // %3 | 611 "+r"(width) // %3 |
611 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 612 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
612 [kYToRgb]"r"(&kYToRgb) | 613 [kYToRgb]"r"(&kYToRgb) |
613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 614 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
614 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 615 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
615 ); | 616 ); |
616 } | 617 } |
617 #endif // HAS_NV21TOARGBROW_NEON | 618 #endif // HAS_NV21TOARGBROW_NEON |
618 | 619 |
619 #ifdef HAS_NV12TORGB565ROW_NEON | 620 #ifdef HAS_NV12TORGB565ROW_NEON |
// Converts one row from a src_y plane plus a single combined src_uv plane into
// 2-byte pixels written to dst_rgb565, 8 pixels per loop pass.
// Per pass: READNV12 and YUV422TORGB(v22, v21, v20) run first, then the
// ARGBTORGB565 macro (defined outside this view; presumably packs v20..v22
// into v0 -- confirm) prepares v0, which st1 stores as eight 16-bit lanes
// (16 bytes) at dst_rgb565.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w3 (32-bit W register view of the int
// width operand); OLD used %3 (64-bit X register).
620 void NV12ToRGB565Row_NEON(const uint8* src_y, | 621 void NV12ToRGB565Row_NEON(const uint8* src_y, |
621 const uint8* src_uv, | 622 const uint8* src_uv, |
622 uint8* dst_rgb565, | 623 uint8* dst_rgb565, |
623 int width) { | 624 int width) { |
624 asm volatile ( | 625 asm volatile ( |
625 YUV422TORGB_SETUP_REG | 626 YUV422TORGB_SETUP_REG |
626 "1: \n" | 627 "1: \n" |
627 READNV12 | 628 READNV12 |
628 YUV422TORGB(v22, v21, v20) | 629 YUV422TORGB(v22, v21, v20) |
629 "subs %3, %3, #8 \n" | 630 "subs %w3, %w3, #8 \n" |
630 ARGBTORGB565 | 631 ARGBTORGB565 |
631 MEMACCESS(2) | 632 MEMACCESS(2) |
632 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. | 633 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. |
633 "b.gt 1b \n" | 634 "b.gt 1b \n" |
634 : "+r"(src_y), // %0 | 635 : "+r"(src_y), // %0 |
635 "+r"(src_uv), // %1 | 636 "+r"(src_uv), // %1 |
636 "+r"(dst_rgb565), // %2 | 637 "+r"(dst_rgb565), // %2 |
637 "+r"(width) // %3 | 638 "+r"(width) // %3 |
638 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 639 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
639 [kYToRgb]"r"(&kYToRgb) | 640 [kYToRgb]"r"(&kYToRgb) |
640 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 641 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
641 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 642 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
642 ); | 643 ); |
643 } | 644 } |
644 #endif // HAS_NV12TORGB565ROW_NEON | 645 #endif // HAS_NV12TORGB565ROW_NEON |
645 | 646 |
646 #ifdef HAS_NV21TORGB565ROW_NEON | 647 #ifdef HAS_NV21TORGB565ROW_NEON |
// Converts one row from a src_y plane plus a single combined src_uv plane into
// 2-byte pixels written to dst_rgb565, 8 pixels per loop pass.
// Same structure as the NV12 RGB565 variant; the only visible difference is
// that the samples are fetched with READNV21 instead of READNV12.
// After YUV422TORGB(v22, v21, v20), the ARGBTORGB565 macro (defined outside
// this view; presumably packs v20..v22 into v0 -- confirm) prepares v0, which
// st1 stores as eight 16-bit lanes (16 bytes) at dst_rgb565.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW writes the counter as %w3 (32-bit W register view of the int
// width operand); OLD used %3 (64-bit X register).
647 void NV21ToRGB565Row_NEON(const uint8* src_y, | 648 void NV21ToRGB565Row_NEON(const uint8* src_y, |
648 const uint8* src_uv, | 649 const uint8* src_uv, |
649 uint8* dst_rgb565, | 650 uint8* dst_rgb565, |
650 int width) { | 651 int width) { |
651 asm volatile ( | 652 asm volatile ( |
652 YUV422TORGB_SETUP_REG | 653 YUV422TORGB_SETUP_REG |
653 "1: \n" | 654 "1: \n" |
654 READNV21 | 655 READNV21 |
655 YUV422TORGB(v22, v21, v20) | 656 YUV422TORGB(v22, v21, v20) |
656 "subs %3, %3, #8 \n" | 657 "subs %w3, %w3, #8 \n" |
657 ARGBTORGB565 | 658 ARGBTORGB565 |
658 MEMACCESS(2) | 659 MEMACCESS(2) |
659 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. | 660 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. |
660 "b.gt 1b \n" | 661 "b.gt 1b \n" |
661 : "+r"(src_y), // %0 | 662 : "+r"(src_y), // %0 |
662 "+r"(src_uv), // %1 | 663 "+r"(src_uv), // %1 |
663 "+r"(dst_rgb565), // %2 | 664 "+r"(dst_rgb565), // %2 |
664 "+r"(width) // %3 | 665 "+r"(width) // %3 |
665 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 666 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
666 [kYToRgb]"r"(&kYToRgb) | 667 [kYToRgb]"r"(&kYToRgb) |
667 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 668 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
668 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 669 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
669 ); | 670 ); |
670 } | 671 } |
671 #endif // HAS_NV21TORGB565ROW_NEON | 672 #endif // HAS_NV21TORGB565ROW_NEON |
672 | 673 |
673 #ifdef HAS_YUY2TOARGBROW_NEON | 674 #ifdef HAS_YUY2TOARGBROW_NEON |
// Converts one row from the single packed src_yuy2 buffer into 4-byte pixels
// written to dst_argb, 8 pixels per loop pass.
// Per pass: READYUY2 and YUV422TORGB(v22, v21, v20) (macros defined outside
// this view) run first, v23 is filled with 255, then st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW copies width into an int64 local (width64) and binds that to
// operand %2, while the subs instruction addresses it through %w2, the 32-bit
// W view of the same register; OLD bound the int width directly and used %2.
674 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, | 675 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, |
675 uint8* dst_argb, | 676 uint8* dst_argb, |
676 int width) { | 677 int width) { |
| 678 int64 width64 = (int64)(width); |
677 asm volatile ( | 679 asm volatile ( |
678 YUV422TORGB_SETUP_REG | 680 YUV422TORGB_SETUP_REG |
679 "1: \n" | 681 "1: \n" |
680 READYUY2 | 682 READYUY2 |
681 YUV422TORGB(v22, v21, v20) | 683 YUV422TORGB(v22, v21, v20) |
682 "subs %2, %2, #8 \n" | 684 "subs %w2, %w2, #8 \n" |
683 "movi v23.8b, #255 \n" | 685 "movi v23.8b, #255 \n" |
684 MEMACCESS(1) | 686 MEMACCESS(1) |
685 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 687 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
686 "b.gt 1b \n" | 688 "b.gt 1b \n" |
687 : "+r"(src_yuy2), // %0 | 689 : "+r"(src_yuy2), // %0 |
688 "+r"(dst_argb), // %1 | 690 "+r"(dst_argb), // %1 |
689 "+r"(width) // %2 | 691 "+r"(width64) // %2 |
690 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 692 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
691 [kYToRgb]"r"(&kYToRgb) | 693 [kYToRgb]"r"(&kYToRgb) |
692 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 694 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
693 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 695 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
694 ); | 696 ); |
695 } | 697 } |
696 #endif // HAS_YUY2TOARGBROW_NEON | 698 #endif // HAS_YUY2TOARGBROW_NEON |
697 | 699 |
698 #ifdef HAS_UYVYTOARGBROW_NEON | 700 #ifdef HAS_UYVYTOARGBROW_NEON |
// Converts one row from the single packed src_uyvy buffer into 4-byte pixels
// written to dst_argb, 8 pixels per loop pass.
// Same structure as the YUY2 variant; the only visible difference is that the
// samples are fetched with READUYVY instead of READYUY2 (both macros are
// defined outside this view).
// After YUV422TORGB(v22, v21, v20), v23 is filled with 255 and st4 interleaves
// v20, v21, v22, v23 (in that byte order) into 32 bytes at dst_argb.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 8 -- confirm.
// Diff note: NEW copies width into an int64 local (width64) and binds that to
// operand %2, while the subs instruction addresses it through %w2, the 32-bit
// W view of the same register; OLD bound the int width directly and used %2.
699 void UYVYToARGBRow_NEON(const uint8* src_uyvy, | 701 void UYVYToARGBRow_NEON(const uint8* src_uyvy, |
700 uint8* dst_argb, | 702 uint8* dst_argb, |
701 int width) { | 703 int width) { |
| 704 int64 width64 = (int64)(width); |
702 asm volatile ( | 705 asm volatile ( |
703 YUV422TORGB_SETUP_REG | 706 YUV422TORGB_SETUP_REG |
704 "1: \n" | 707 "1: \n" |
705 READUYVY | 708 READUYVY |
706 YUV422TORGB(v22, v21, v20) | 709 YUV422TORGB(v22, v21, v20) |
707 "subs %2, %2, #8 \n" | 710 "subs %w2, %w2, #8 \n" |
708 "movi v23.8b, #255 \n" | 711 "movi v23.8b, #255 \n" |
709 MEMACCESS(1) | 712 MEMACCESS(1) |
710 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" | 713 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" |
711 "b.gt 1b \n" | 714 "b.gt 1b \n" |
712 : "+r"(src_uyvy), // %0 | 715 : "+r"(src_uyvy), // %0 |
713 "+r"(dst_argb), // %1 | 716 "+r"(dst_argb), // %1 |
714 "+r"(width) // %2 | 717 "+r"(width64) // %2 |
715 : [kUVBiasBGR]"r"(&kUVBiasBGR), | 718 : [kUVBiasBGR]"r"(&kUVBiasBGR), |
716 [kYToRgb]"r"(&kYToRgb) | 719 [kYToRgb]"r"(&kYToRgb) |
717 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
718 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 721 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
719 ); | 722 ); |
720 } | 723 } |
721 #endif // HAS_UYVYTOARGBROW_NEON | 724 #endif // HAS_UYVYTOARGBROW_NEON |
722 | 725 |
723 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. | 726 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. |
// Each pass: ld2 de-interleaves 32 source bytes so that v0 receives the
// even-indexed bytes and v1 the odd-indexed bytes; v0 is then stored as 16
// bytes at dst_u and v1 as 16 bytes at dst_v. All three pointers are
// post-incremented by the instructions themselves.
// The body executes before the count is tested and repeats while the remaining
// count is > 0, so width is presumably a positive multiple of 16 -- confirm.
// Diff note: NEW writes the counter as %w3 (32-bit W register view of the int
// width operand); OLD used %3 (64-bit X register).
724 #ifdef HAS_SPLITUVROW_NEON | 727 #ifdef HAS_SPLITUVROW_NEON |
725 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 728 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
726 int width) { | 729 int width) { |
727 asm volatile ( | 730 asm volatile ( |
728 "1: \n" | 731 "1: \n" |
729 MEMACCESS(0) | 732 MEMACCESS(0) |
730 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV | 733 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV |
731 "subs %3, %3, #16 \n" // 16 processed per loop | 734 "subs %w3, %w3, #16 \n" // 16 processed per loop |
732 MEMACCESS(1) | 735 MEMACCESS(1) |
733 "st1 {v0.16b}, [%1], #16 \n" // store U | 736 "st1 {v0.16b}, [%1], #16 \n" // store U |
734 MEMACCESS(2) | 737 MEMACCESS(2) |
735 "st1 {v1.16b}, [%2], #16 \n" // store V | 738 "st1 {v1.16b}, [%2], #16 \n" // store V |
736 "b.gt 1b \n" | 739 "b.gt 1b \n" |
737 : "+r"(src_uv), // %0 | 740 : "+r"(src_uv), // %0 |
738 "+r"(dst_u), // %1 | 741 "+r"(dst_u), // %1 |
739 "+r"(dst_v), // %2 | 742 "+r"(dst_v), // %2 |
740 "+r"(width) // %3 // Output registers | 743 "+r"(width) // %3 // Output registers |
741 : // Input registers | 744 : // Input registers |
742 : "cc", "memory", "v0", "v1" // Clobber List | 745 : "cc", "memory", "v0", "v1" // Clobber List |
743 ); | 746 ); |
744 } | 747 } |
745 #endif // HAS_SPLITUVROW_NEON | 748 #endif // HAS_SPLITUVROW_NEON |
746 | 749 |
747 // Reads 16 U's and V's and writes out 16 pairs of UV. | 750 // Reads 16 U's and V's and writes out 16 pairs of UV. |
748 #ifdef HAS_MERGEUVROW_NEON | 751 #ifdef HAS_MERGEUVROW_NEON |
749 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 752 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
750 int width) { | 753 int width) { |
751 asm volatile ( | 754 asm volatile ( |
752 "1: \n" | 755 "1: \n" |
753 MEMACCESS(0) | 756 MEMACCESS(0) |
754 "ld1 {v0.16b}, [%0], #16 \n" // load U | 757 "ld1 {v0.16b}, [%0], #16 \n" // load U |
755 MEMACCESS(1) | 758 MEMACCESS(1) |
756 "ld1 {v1.16b}, [%1], #16 \n" // load V | 759 "ld1 {v1.16b}, [%1], #16 \n" // load V |
757 "subs %3, %3, #16 \n" // 16 processed per loop | 760 "subs %w3, %w3, #16 \n" // 16 processed per loop |
758 MEMACCESS(2) | 761 MEMACCESS(2) |
759 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV | 762 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV |
760 "b.gt 1b \n" | 763 "b.gt 1b \n" |
761 : | 764 : |
762 "+r"(src_u), // %0 | 765 "+r"(src_u), // %0 |
763 "+r"(src_v), // %1 | 766 "+r"(src_v), // %1 |
764 "+r"(dst_uv), // %2 | 767 "+r"(dst_uv), // %2 |
765 "+r"(width) // %3 // Output registers | 768 "+r"(width) // %3 // Output registers |
766 : // Input registers | 769 : // Input registers |
767 : "cc", "memory", "v0", "v1" // Clobber List | 770 : "cc", "memory", "v0", "v1" // Clobber List |
768 ); | 771 ); |
769 } | 772 } |
770 #endif // HAS_MERGEUVROW_NEON | 773 #endif // HAS_MERGEUVROW_NEON |
771 | 774 |
772 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. | 775 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. |
773 #ifdef HAS_COPYROW_NEON | 776 #ifdef HAS_COPYROW_NEON |
774 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { | 777 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { |
775 asm volatile ( | 778 asm volatile ( |
776 "1: \n" | 779 "1: \n" |
777 MEMACCESS(0) | 780 MEMACCESS(0) |
778 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 | 781 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 |
779 "subs %2, %2, #32 \n" // 32 processed per loop | 782 "subs %w2, %w2, #32 \n" // 32 processed per loop |
780 MEMACCESS(1) | 783 MEMACCESS(1) |
781 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 | 784 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 |
782 "b.gt 1b \n" | 785 "b.gt 1b \n" |
783 : "+r"(src), // %0 | 786 : "+r"(src), // %0 |
784 "+r"(dst), // %1 | 787 "+r"(dst), // %1 |
785 "+r"(count) // %2 // Output registers | 788 "+r"(count) // %2 // Output registers |
786 : // Input registers | 789 : // Input registers |
787 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 790 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
788 ); | 791 ); |
789 } | 792 } |
790 #endif // HAS_COPYROW_NEON | 793 #endif // HAS_COPYROW_NEON |
791 | 794 |
792 // SetRow writes 'count' bytes using an 8 bit value repeated. | 795 // SetRow writes 'count' bytes using an 8 bit value repeated. |
793 void SetRow_NEON(uint8* dst, uint8 v8, int count) { | 796 void SetRow_NEON(uint8* dst, uint8 v8, int count) { |
794 asm volatile ( | 797 asm volatile ( |
795 "dup v0.16b, %w2 \n" // duplicate 16 bytes | 798 "dup v0.16b, %w2 \n" // duplicate 16 bytes |
796 "1: \n" | 799 "1: \n" |
797 "subs %1, %1, #16 \n" // 16 bytes per loop | 800 "subs %w1, %w1, #16 \n" // 16 bytes per loop |
798 MEMACCESS(0) | 801 MEMACCESS(0) |
799 "st1 {v0.16b}, [%0], #16 \n" // store | 802 "st1 {v0.16b}, [%0], #16 \n" // store |
800 "b.gt 1b \n" | 803 "b.gt 1b \n" |
801 : "+r"(dst), // %0 | 804 : "+r"(dst), // %0 |
802 "+r"(count) // %1 | 805 "+r"(count) // %1 |
803 : "r"(v8) // %2 | 806 : "r"(v8) // %2 |
804 : "cc", "memory", "v0" | 807 : "cc", "memory", "v0" |
805 ); | 808 ); |
806 } | 809 } |
807 | 810 |
808 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { | 811 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { |
809 asm volatile ( | 812 asm volatile ( |
810 "dup v0.4s, %w2 \n" // duplicate 4 ints | 813 "dup v0.4s, %w2 \n" // duplicate 4 ints |
811 "1: \n" | 814 "1: \n" |
812 "subs %1, %1, #4 \n" // 4 ints per loop | 815 "subs %w1, %w1, #4 \n" // 4 ints per loop |
813 MEMACCESS(0) | 816 MEMACCESS(0) |
814 "st1 {v0.16b}, [%0], #16 \n" // store | 817 "st1 {v0.16b}, [%0], #16 \n" // store |
815 "b.gt 1b \n" | 818 "b.gt 1b \n" |
816 : "+r"(dst), // %0 | 819 : "+r"(dst), // %0 |
817 "+r"(count) // %1 | 820 "+r"(count) // %1 |
818 : "r"(v32) // %2 | 821 : "r"(v32) // %2 |
819 : "cc", "memory", "v0" | 822 : "cc", "memory", "v0" |
820 ); | 823 ); |
821 } | 824 } |
822 | 825 |
823 #ifdef HAS_MIRRORROW_NEON | 826 #ifdef HAS_MIRRORROW_NEON |
824 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 827 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 828 int64 width64 = (int64) width; |
825 asm volatile ( | 829 asm volatile ( |
826 // Start at end of source row. | 830 // Start at end of source row. |
827 "add %0, %0, %2 \n" | 831 "add %0, %0, %2 \n" |
828 "sub %0, %0, #16 \n" | 832 "sub %0, %0, #16 \n" |
829 | 833 |
830 "1: \n" | 834 "1: \n" |
831 MEMACCESS(0) | 835 MEMACCESS(0) |
832 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 836 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
833 "subs %2, %2, #16 \n" // 16 pixels per loop. | 837 "subs %2, %2, #16 \n" // 16 pixels per loop. |
834 "rev64 v0.16b, v0.16b \n" | 838 "rev64 v0.16b, v0.16b \n" |
835 MEMACCESS(1) | 839 MEMACCESS(1) |
836 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 840 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
837 MEMACCESS(1) | 841 MEMACCESS(1) |
838 "st1 {v0.D}[0], [%1], #8 \n" | 842 "st1 {v0.D}[0], [%1], #8 \n" |
839 "b.gt 1b \n" | 843 "b.gt 1b \n" |
840 : "+r"(src), // %0 | 844 : "+r"(src), // %0 |
841 "+r"(dst), // %1 | 845 "+r"(dst), // %1 |
842 "+r"(width) // %2 | 846 "+r"(width64) // %2 |
843 : "r"((ptrdiff_t)-16) // %3 | 847 : "r"((ptrdiff_t)-16) // %3 |
844 : "cc", "memory", "v0" | 848 : "cc", "memory", "v0" |
845 ); | 849 ); |
846 } | 850 } |
847 #endif // HAS_MIRRORROW_NEON | 851 #endif // HAS_MIRRORROW_NEON |
848 | 852 |
849 #ifdef HAS_MIRRORUVROW_NEON | 853 #ifdef HAS_MIRRORUVROW_NEON |
850 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 854 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
851 int width) { | 855 int width) { |
| 856 int64 width64 = (int64) width; |
852 asm volatile ( | 857 asm volatile ( |
853 // Start at end of source row. | 858 // Start at end of source row. |
854 "add %0, %0, %3, lsl #1 \n" | 859 "add %0, %0, %3, lsl #1 \n" |
855 "sub %0, %0, #16 \n" | 860 "sub %0, %0, #16 \n" |
856 | 861 |
857 "1: \n" | 862 "1: \n" |
858 MEMACCESS(0) | 863 MEMACCESS(0) |
859 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 | 864 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
860 "subs %3, %3, #8 \n" // 8 pixels per loop. | 865 "subs %3, %3, #8 \n" // 8 pixels per loop. |
861 "rev64 v0.8b, v0.8b \n" | 866 "rev64 v0.8b, v0.8b \n" |
862 "rev64 v1.8b, v1.8b \n" | 867 "rev64 v1.8b, v1.8b \n" |
863 MEMACCESS(1) | 868 MEMACCESS(1) |
864 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 | 869 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
865 MEMACCESS(2) | 870 MEMACCESS(2) |
866 "st1 {v1.8b}, [%2], #8 \n" | 871 "st1 {v1.8b}, [%2], #8 \n" |
867 "b.gt 1b \n" | 872 "b.gt 1b \n" |
868 : "+r"(src_uv), // %0 | 873 : "+r"(src_uv), // %0 |
869 "+r"(dst_u), // %1 | 874 "+r"(dst_u), // %1 |
870 "+r"(dst_v), // %2 | 875 "+r"(dst_v), // %2 |
871 "+r"(width) // %3 | 876 "+r"(width64) // %3 |
872 : "r"((ptrdiff_t)-16) // %4 | 877 : "r"((ptrdiff_t)-16) // %4 |
873 : "cc", "memory", "v0", "v1" | 878 : "cc", "memory", "v0", "v1" |
874 ); | 879 ); |
875 } | 880 } |
876 #endif // HAS_MIRRORUVROW_NEON | 881 #endif // HAS_MIRRORUVROW_NEON |
877 | 882 |
878 #ifdef HAS_ARGBMIRRORROW_NEON | 883 #ifdef HAS_ARGBMIRRORROW_NEON |
879 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 884 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 885 int64 width64 = (int64) width; |
880 asm volatile ( | 886 asm volatile ( |
881 // Start at end of source row. | 887 // Start at end of source row. |
882 "add %0, %0, %2, lsl #2 \n" | 888 "add %0, %0, %2, lsl #2 \n" |
883 "sub %0, %0, #16 \n" | 889 "sub %0, %0, #16 \n" |
884 | 890 |
885 "1: \n" | 891 "1: \n" |
886 MEMACCESS(0) | 892 MEMACCESS(0) |
887 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 893 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
888 "subs %2, %2, #4 \n" // 4 pixels per loop. | 894 "subs %2, %2, #4 \n" // 4 pixels per loop. |
889 "rev64 v0.4s, v0.4s \n" | 895 "rev64 v0.4s, v0.4s \n" |
890 MEMACCESS(1) | 896 MEMACCESS(1) |
891 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 897 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
892 MEMACCESS(1) | 898 MEMACCESS(1) |
893 "st1 {v0.D}[0], [%1], #8 \n" | 899 "st1 {v0.D}[0], [%1], #8 \n" |
894 "b.gt 1b \n" | 900 "b.gt 1b \n" |
895 : "+r"(src), // %0 | 901 : "+r"(src), // %0 |
896 "+r"(dst), // %1 | 902 "+r"(dst), // %1 |
897 "+r"(width) // %2 | 903 "+r"(width64) // %2 |
898 : "r"((ptrdiff_t)-16) // %3 | 904 : "r"((ptrdiff_t)-16) // %3 |
899 : "cc", "memory", "v0" | 905 : "cc", "memory", "v0" |
900 ); | 906 ); |
901 } | 907 } |
902 #endif // HAS_ARGBMIRRORROW_NEON | 908 #endif // HAS_ARGBMIRRORROW_NEON |
903 | 909 |
904 #ifdef HAS_RGB24TOARGBROW_NEON | 910 #ifdef HAS_RGB24TOARGBROW_NEON |
905 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 911 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
906 asm volatile ( | 912 asm volatile ( |
907 "movi v4.8b, #255 \n" // Alpha | 913 "movi v4.8b, #255 \n" // Alpha |
908 "1: \n" | 914 "1: \n" |
909 MEMACCESS(0) | 915 MEMACCESS(0) |
910 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 916 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
911 "subs %2, %2, #8 \n" // 8 processed per loop. | 917 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
912 MEMACCESS(1) | 918 MEMACCESS(1) |
913 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 919 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
914 "b.gt 1b \n" | 920 "b.gt 1b \n" |
915 : "+r"(src_rgb24), // %0 | 921 : "+r"(src_rgb24), // %0 |
916 "+r"(dst_argb), // %1 | 922 "+r"(dst_argb), // %1 |
917 "+r"(pix) // %2 | 923 "+r"(pix) // %2 |
918 : | 924 : |
919 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 925 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
920 ); | 926 ); |
921 } | 927 } |
922 #endif // HAS_RGB24TOARGBROW_NEON | 928 #endif // HAS_RGB24TOARGBROW_NEON |
923 | 929 |
924 #ifdef HAS_RAWTOARGBROW_NEON | 930 #ifdef HAS_RAWTOARGBROW_NEON |
925 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { | 931 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { |
926 asm volatile ( | 932 asm volatile ( |
927 "movi v5.8b, #255 \n" // Alpha | 933 "movi v5.8b, #255 \n" // Alpha |
928 "1: \n" | 934 "1: \n" |
929 MEMACCESS(0) | 935 MEMACCESS(0) |
930 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 936 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
931 "subs %2, %2, #8 \n" // 8 processed per loop. | 937 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
932 "orr v3.8b, v1.8b, v1.8b \n" // move g | 938 "orr v3.8b, v1.8b, v1.8b \n" // move g |
933 "orr v4.8b, v0.8b, v0.8b \n" // move r | 939 "orr v4.8b, v0.8b, v0.8b \n" // move r |
934 MEMACCESS(1) | 940 MEMACCESS(1) |
935 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 941 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
936 "b.gt 1b \n" | 942 "b.gt 1b \n" |
937 : "+r"(src_raw), // %0 | 943 : "+r"(src_raw), // %0 |
938 "+r"(dst_argb), // %1 | 944 "+r"(dst_argb), // %1 |
939 "+r"(pix) // %2 | 945 "+r"(pix) // %2 |
940 : | 946 : |
941 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 947 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
(...skipping 14 matching lines...) Expand all Loading... |
956 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ | 962 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ |
957 "dup v2.2D, v0.D[1] \n" /* R */ | 963 "dup v2.2D, v0.D[1] \n" /* R */ |
958 | 964 |
959 #ifdef HAS_RGB565TOARGBROW_NEON | 965 #ifdef HAS_RGB565TOARGBROW_NEON |
960 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { | 966 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { |
961 asm volatile ( | 967 asm volatile ( |
962 "movi v3.8b, #255 \n" // Alpha | 968 "movi v3.8b, #255 \n" // Alpha |
963 "1: \n" | 969 "1: \n" |
964 MEMACCESS(0) | 970 MEMACCESS(0) |
965 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
966 "subs %2, %2, #8 \n" // 8 processed per loop. | 972 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
967 RGB565TOARGB | 973 RGB565TOARGB |
968 MEMACCESS(1) | 974 MEMACCESS(1) |
969 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 975 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
970 "b.gt 1b \n" | 976 "b.gt 1b \n" |
971 : "+r"(src_rgb565), // %0 | 977 : "+r"(src_rgb565), // %0 |
972 "+r"(dst_argb), // %1 | 978 "+r"(dst_argb), // %1 |
973 "+r"(pix) // %2 | 979 "+r"(pix) // %2 |
974 : | 980 : |
975 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List | 981 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List |
976 ); | 982 ); |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1015 "dup v1.2D, v0.D[1] \n" /* G */ \ | 1021 "dup v1.2D, v0.D[1] \n" /* G */ \ |
1016 | 1022 |
1017 #ifdef HAS_ARGB1555TOARGBROW_NEON | 1023 #ifdef HAS_ARGB1555TOARGBROW_NEON |
1018 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, | 1024 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, |
1019 int pix) { | 1025 int pix) { |
1020 asm volatile ( | 1026 asm volatile ( |
1021 "movi v3.8b, #255 \n" // Alpha | 1027 "movi v3.8b, #255 \n" // Alpha |
1022 "1: \n" | 1028 "1: \n" |
1023 MEMACCESS(0) | 1029 MEMACCESS(0) |
1024 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 1030 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
1025 "subs %2, %2, #8 \n" // 8 processed per loop. | 1031 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1026 ARGB1555TOARGB | 1032 ARGB1555TOARGB |
1027 MEMACCESS(1) | 1033 MEMACCESS(1) |
1028 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 1034 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
1029 "b.gt 1b \n" | 1035 "b.gt 1b \n" |
1030 : "+r"(src_argb1555), // %0 | 1036 : "+r"(src_argb1555), // %0 |
1031 "+r"(dst_argb), // %1 | 1037 "+r"(dst_argb), // %1 |
1032 "+r"(pix) // %2 | 1038 "+r"(pix) // %2 |
1033 : | 1039 : |
1034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1040 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1035 ); | 1041 ); |
(...skipping 12 matching lines...) Expand all Loading... |
1048 "dup v0.2D, v2.D[1] \n" \ | 1054 "dup v0.2D, v2.D[1] \n" \ |
1049 "dup v1.2D, v3.D[1] \n" | 1055 "dup v1.2D, v3.D[1] \n" |
1050 | 1056 |
1051 #ifdef HAS_ARGB4444TOARGBROW_NEON | 1057 #ifdef HAS_ARGB4444TOARGBROW_NEON |
1052 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, | 1058 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, |
1053 int pix) { | 1059 int pix) { |
1054 asm volatile ( | 1060 asm volatile ( |
1055 "1: \n" | 1061 "1: \n" |
1056 MEMACCESS(0) | 1062 MEMACCESS(0) |
1057 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 1063 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
1058 "subs %2, %2, #8 \n" // 8 processed per loop. | 1064 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1059 ARGB4444TOARGB | 1065 ARGB4444TOARGB |
1060 MEMACCESS(1) | 1066 MEMACCESS(1) |
1061 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 1067 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
1062 "b.gt 1b \n" | 1068 "b.gt 1b \n" |
1063 : "+r"(src_argb4444), // %0 | 1069 : "+r"(src_argb4444), // %0 |
1064 "+r"(dst_argb), // %1 | 1070 "+r"(dst_argb), // %1 |
1065 "+r"(pix) // %2 | 1071 "+r"(pix) // %2 |
1066 : | 1072 : |
1067 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List | 1073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List |
1068 ); | 1074 ); |
1069 } | 1075 } |
1070 #endif // HAS_ARGB4444TOARGBROW_NEON | 1076 #endif // HAS_ARGB4444TOARGBROW_NEON |
1071 | 1077 |
1072 #ifdef HAS_ARGBTORGB24ROW_NEON | 1078 #ifdef HAS_ARGBTORGB24ROW_NEON |
1073 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { | 1079 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { |
1074 asm volatile ( | 1080 asm volatile ( |
1075 "1: \n" | 1081 "1: \n" |
1076 MEMACCESS(0) | 1082 MEMACCESS(0) |
1077 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels | 1083 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels |
1078 "subs %2, %2, #8 \n" // 8 processed per loop. | 1084 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1079 MEMACCESS(1) | 1085 MEMACCESS(1) |
1080 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. | 1086 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
1081 "b.gt 1b \n" | 1087 "b.gt 1b \n" |
1082 : "+r"(src_argb), // %0 | 1088 : "+r"(src_argb), // %0 |
1083 "+r"(dst_rgb24), // %1 | 1089 "+r"(dst_rgb24), // %1 |
1084 "+r"(pix) // %2 | 1090 "+r"(pix) // %2 |
1085 : | 1091 : |
1086 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 1092 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
1087 ); | 1093 ); |
1088 } | 1094 } |
1089 #endif // HAS_ARGBTORGB24ROW_NEON | 1095 #endif // HAS_ARGBTORGB24ROW_NEON |
1090 | 1096 |
1091 #ifdef HAS_ARGBTORAWROW_NEON | 1097 #ifdef HAS_ARGBTORAWROW_NEON |
1092 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { | 1098 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { |
1093 asm volatile ( | 1099 asm volatile ( |
1094 "1: \n" | 1100 "1: \n" |
1095 MEMACCESS(0) | 1101 MEMACCESS(0) |
1096 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a | 1102 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a |
1097 "subs %2, %2, #8 \n" // 8 processed per loop. | 1103 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1098 "orr v4.8b, v2.8b, v2.8b \n" // mov g | 1104 "orr v4.8b, v2.8b, v2.8b \n" // mov g |
1099 "orr v5.8b, v1.8b, v1.8b \n" // mov b | 1105 "orr v5.8b, v1.8b, v1.8b \n" // mov b |
1100 MEMACCESS(1) | 1106 MEMACCESS(1) |
1101 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b | 1107 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b |
1102 "b.gt 1b \n" | 1108 "b.gt 1b \n" |
1103 : "+r"(src_argb), // %0 | 1109 : "+r"(src_argb), // %0 |
1104 "+r"(dst_raw), // %1 | 1110 "+r"(dst_raw), // %1 |
1105 "+r"(pix) // %2 | 1111 "+r"(pix) // %2 |
1106 : | 1112 : |
1107 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List | 1113 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
1108 ); | 1114 ); |
1109 } | 1115 } |
1110 #endif // HAS_ARGBTORAWROW_NEON | 1116 #endif // HAS_ARGBTORAWROW_NEON |
1111 | 1117 |
1112 #ifdef HAS_YUY2TOYROW_NEON | 1118 #ifdef HAS_YUY2TOYROW_NEON |
1113 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { | 1119 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { |
1114 asm volatile ( | 1120 asm volatile ( |
1115 "1: \n" | 1121 "1: \n" |
1116 MEMACCESS(0) | 1122 MEMACCESS(0) |
1117 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. | 1123 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
1118 "subs %2, %2, #16 \n" // 16 processed per loop. | 1124 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
1119 MEMACCESS(1) | 1125 MEMACCESS(1) |
1120 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1126 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1121 "b.gt 1b \n" | 1127 "b.gt 1b \n" |
1122 : "+r"(src_yuy2), // %0 | 1128 : "+r"(src_yuy2), // %0 |
1123 "+r"(dst_y), // %1 | 1129 "+r"(dst_y), // %1 |
1124 "+r"(pix) // %2 | 1130 "+r"(pix) // %2 |
1125 : | 1131 : |
1126 : "cc", "memory", "v0", "v1" // Clobber List | 1132 : "cc", "memory", "v0", "v1" // Clobber List |
1127 ); | 1133 ); |
1128 } | 1134 } |
1129 #endif // HAS_YUY2TOYROW_NEON | 1135 #endif // HAS_YUY2TOYROW_NEON |
1130 | 1136 |
1131 #ifdef HAS_UYVYTOYROW_NEON | 1137 #ifdef HAS_UYVYTOYROW_NEON |
1132 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { | 1138 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { |
1133 asm volatile ( | 1139 asm volatile ( |
1134 "1: \n" | 1140 "1: \n" |
1135 MEMACCESS(0) | 1141 MEMACCESS(0) |
1136 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. | 1142 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
1137 "subs %2, %2, #16 \n" // 16 processed per loop. | 1143 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
1138 MEMACCESS(1) | 1144 MEMACCESS(1) |
1139 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1145 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
1140 "b.gt 1b \n" | 1146 "b.gt 1b \n" |
1141 : "+r"(src_uyvy), // %0 | 1147 : "+r"(src_uyvy), // %0 |
1142 "+r"(dst_y), // %1 | 1148 "+r"(dst_y), // %1 |
1143 "+r"(pix) // %2 | 1149 "+r"(pix) // %2 |
1144 : | 1150 : |
1145 : "cc", "memory", "v0", "v1" // Clobber List | 1151 : "cc", "memory", "v0", "v1" // Clobber List |
1146 ); | 1152 ); |
1147 } | 1153 } |
1148 #endif // HAS_UYVYTOYROW_NEON | 1154 #endif // HAS_UYVYTOYROW_NEON |
1149 | 1155 |
1150 #ifdef HAS_YUY2TOUV422ROW_NEON | 1156 #ifdef HAS_YUY2TOUV422ROW_NEON |
1151 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1157 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
1152 int pix) { | 1158 int pix) { |
1153 asm volatile ( | 1159 asm volatile ( |
1154 "1: \n" | 1160 "1: \n" |
1155 MEMACCESS(0) | 1161 MEMACCESS(0) |
1156 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels | 1162 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels |
1157 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. | 1163 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
1158 MEMACCESS(1) | 1164 MEMACCESS(1) |
1159 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. | 1165 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
1160 MEMACCESS(2) | 1166 MEMACCESS(2) |
1161 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. | 1167 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
1162 "b.gt 1b \n" | 1168 "b.gt 1b \n" |
1163 : "+r"(src_yuy2), // %0 | 1169 : "+r"(src_yuy2), // %0 |
1164 "+r"(dst_u), // %1 | 1170 "+r"(dst_u), // %1 |
1165 "+r"(dst_v), // %2 | 1171 "+r"(dst_v), // %2 |
1166 "+r"(pix) // %3 | 1172 "+r"(pix) // %3 |
1167 : | 1173 : |
1168 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1174 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1169 ); | 1175 ); |
1170 } | 1176 } |
1171 #endif // HAS_YUY2TOUV422ROW_NEON | 1177 #endif // HAS_YUY2TOUV422ROW_NEON |
1172 | 1178 |
1173 #ifdef HAS_UYVYTOUV422ROW_NEON | 1179 #ifdef HAS_UYVYTOUV422ROW_NEON |
1174 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1180 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
1175 int pix) { | 1181 int pix) { |
1176 asm volatile ( | 1182 asm volatile ( |
1177 "1: \n" | 1183 "1: \n" |
1178 MEMACCESS(0) | 1184 MEMACCESS(0) |
1179 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels | 1185 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels |
1180 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. | 1186 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
1181 MEMACCESS(1) | 1187 MEMACCESS(1) |
1182 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. | 1188 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
1183 MEMACCESS(2) | 1189 MEMACCESS(2) |
1184 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. | 1190 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
1185 "b.gt 1b \n" | 1191 "b.gt 1b \n" |
1186 : "+r"(src_uyvy), // %0 | 1192 : "+r"(src_uyvy), // %0 |
1187 "+r"(dst_u), // %1 | 1193 "+r"(dst_u), // %1 |
1188 "+r"(dst_v), // %2 | 1194 "+r"(dst_v), // %2 |
1189 "+r"(pix) // %3 | 1195 "+r"(pix) // %3 |
1190 : | 1196 : |
1191 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1197 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
1192 ); | 1198 ); |
1193 } | 1199 } |
1194 #endif // HAS_UYVYTOUV422ROW_NEON | 1200 #endif // HAS_UYVYTOUV422ROW_NEON |
1195 | 1201 |
1196 #ifdef HAS_YUY2TOUVROW_NEON | 1202 #ifdef HAS_YUY2TOUVROW_NEON |
1197 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1203 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
1198 uint8* dst_u, uint8* dst_v, int pix) { | 1204 uint8* dst_u, uint8* dst_v, int pix) { |
1199 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; | 1205 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; |
1200 asm volatile ( | 1206 asm volatile ( |
1201 "1: \n" | 1207 "1: \n" |
1202 MEMACCESS(0) | 1208 MEMACCESS(0) |
1203 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1209 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
1204 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. | 1210 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
1205 MEMACCESS(1) | 1211 MEMACCESS(1) |
1206 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1212 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
1207 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U | 1213 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
1208 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V | 1214 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
1209 MEMACCESS(2) | 1215 MEMACCESS(2) |
1210 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. | 1216 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
1211 MEMACCESS(3) | 1217 MEMACCESS(3) |
1212 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. | 1218 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
1213 "b.gt 1b \n" | 1219 "b.gt 1b \n" |
1214 : "+r"(src_yuy2), // %0 | 1220 : "+r"(src_yuy2), // %0 |
1215 "+r"(src_yuy2b), // %1 | 1221 "+r"(src_yuy2b), // %1 |
1216 "+r"(dst_u), // %2 | 1222 "+r"(dst_u), // %2 |
1217 "+r"(dst_v), // %3 | 1223 "+r"(dst_v), // %3 |
1218 "+r"(pix) // %4 | 1224 "+r"(pix) // %4 |
1219 : | 1225 : |
1220 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1226 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1221 "v5", "v6", "v7" // Clobber List | 1227 "v5", "v6", "v7" // Clobber List |
1222 ); | 1228 ); |
1223 } | 1229 } |
1224 #endif // HAS_YUY2TOUVROW_NEON | 1230 #endif // HAS_YUY2TOUVROW_NEON |
1225 | 1231 |
1226 #ifdef HAS_UYVYTOUVROW_NEON | 1232 #ifdef HAS_UYVYTOUVROW_NEON |
1227 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1233 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
1228 uint8* dst_u, uint8* dst_v, int pix) { | 1234 uint8* dst_u, uint8* dst_v, int pix) { |
1229 const uint8* src_uyvyb = src_uyvy + stride_uyvy; | 1235 const uint8* src_uyvyb = src_uyvy + stride_uyvy; |
1230 asm volatile ( | 1236 asm volatile ( |
1231 "1: \n" | 1237 "1: \n" |
1232 MEMACCESS(0) | 1238 MEMACCESS(0) |
1233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1239 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
1234 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. | 1240 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
1235 MEMACCESS(1) | 1241 MEMACCESS(1) |
1236 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1242 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
1237 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U | 1243 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
1238 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V | 1244 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
1239 MEMACCESS(2) | 1245 MEMACCESS(2) |
1240 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. | 1246 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
1241 MEMACCESS(3) | 1247 MEMACCESS(3) |
1242 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. | 1248 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
1243 "b.gt 1b \n" | 1249 "b.gt 1b \n" |
1244 : "+r"(src_uyvy), // %0 | 1250 : "+r"(src_uyvy), // %0 |
1245 "+r"(src_uyvyb), // %1 | 1251 "+r"(src_uyvyb), // %1 |
1246 "+r"(dst_u), // %2 | 1252 "+r"(dst_u), // %2 |
1247 "+r"(dst_v), // %3 | 1253 "+r"(dst_v), // %3 |
1248 "+r"(pix) // %4 | 1254 "+r"(pix) // %4 |
1249 : | 1255 : |
1250 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1256 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
1251 "v5", "v6", "v7" // Clobber List | 1257 "v5", "v6", "v7" // Clobber List |
1252 ); | 1258 ); |
1253 } | 1259 } |
1254 #endif // HAS_UYVYTOUVROW_NEON | 1260 #endif // HAS_UYVYTOUVROW_NEON |
1255 | 1261 |
1256 // Select G channels from ARGB. e.g. GGGGGGGG | |
1257 #ifdef HAS_ARGBTOBAYERGGROW_NEON | |
1258 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, | |
1259 uint32 /*selector*/, int pix) { | |
1260 asm volatile ( | |
1261 "1: \n" | |
1262 MEMACCESS(0) | |
1263 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels | |
1264 "subs %2, %2, #8 \n" // 8 processed per loop | |
1265 MEMACCESS(1) | |
1266 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. | |
1267 "b.gt 1b \n" | |
1268 : "+r"(src_argb), // %0 | |
1269 "+r"(dst_bayer), // %1 | |
1270 "+r"(pix) // %2 | |
1271 : | |
1272 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | |
1273 ); | |
1274 } | |
1275 #endif // HAS_ARGBTOBAYERGGROW_NEON | |
1276 | |
1277 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1262 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
1278 #ifdef HAS_ARGBSHUFFLEROW_NEON | 1263 #ifdef HAS_ARGBSHUFFLEROW_NEON |
1279 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1264 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
1280 const uint8* shuffler, int pix) { | 1265 const uint8* shuffler, int pix) { |
1281 asm volatile ( | 1266 asm volatile ( |
1282 MEMACCESS(3) | 1267 MEMACCESS(3) |
1283 "ld1 {v2.16b}, [%3] \n" // shuffler | 1268 "ld1 {v2.16b}, [%3] \n" // shuffler |
1284 "1: \n" | 1269 "1: \n" |
1285 MEMACCESS(0) | 1270 MEMACCESS(0) |
1286 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. | 1271 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
1287 "subs %2, %2, #4 \n" // 4 processed per loop | 1272 "subs %w2, %w2, #4 \n" // 4 processed per loop |
1288 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels | 1273 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
1289 MEMACCESS(1) | 1274 MEMACCESS(1) |
1290 "st1 {v1.16b}, [%1], #16 \n" // store 4. | 1275 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
1291 "b.gt 1b \n" | 1276 "b.gt 1b \n" |
1292 : "+r"(src_argb), // %0 | 1277 : "+r"(src_argb), // %0 |
1293 "+r"(dst_argb), // %1 | 1278 "+r"(dst_argb), // %1 |
1294 "+r"(pix) // %2 | 1279 "+r"(pix) // %2 |
1295 : "r"(shuffler) // %3 | 1280 : "r"(shuffler) // %3 |
1296 : "cc", "memory", "v0", "v1", "v2" // Clobber List | 1281 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
1297 ); | 1282 ); |
1298 } | 1283 } |
1299 #endif // HAS_ARGBSHUFFLEROW_NEON | 1284 #endif // HAS_ARGBSHUFFLEROW_NEON |
1300 | 1285 |
1301 #ifdef HAS_I422TOYUY2ROW_NEON | 1286 #ifdef HAS_I422TOYUY2ROW_NEON |
// I422ToYUY2Row_NEON: packs one row of planar Y, U and V bytes into
// interleaved 4-byte groups Y0 U Y1 V.
// Per iteration:
//   - ld2 reads 16 Y bytes and de-interleaves them: even samples -> v0,
//     odd samples -> v1;
//   - the odd samples are copied to v2 (orr of a register with itself is a
//     move) so that v1 can be reused for the 8 U bytes;
//   - v3 receives the 8 V bytes;
//   - st4 interleaves v0,v1,v2,v3 byte-wise and writes 32 bytes to dst_yuy2.
// width is decremented by 16 per iteration and the loop is bottom-tested
// (subs ... b.gt), so at least 16 pixels are always processed and a width that
// is not a multiple of 16 is rounded up.
// The NEW column differs from OLD only in naming the counter %w4 instead of
// %4, i.e. the 32-bit W view of the register that holds the int 'width'.
1302 void I422ToYUY2Row_NEON(const uint8* src_y, | 1287 void I422ToYUY2Row_NEON(const uint8* src_y, |
1303 const uint8* src_u, | 1288 const uint8* src_u, |
1304 const uint8* src_v, | 1289 const uint8* src_v, |
1305 uint8* dst_yuy2, int width) { | 1290 uint8* dst_yuy2, int width) { |
1306 asm volatile ( | 1291 asm volatile ( |
1307 "1: \n" | 1292 "1: \n" |
1308 MEMACCESS(0) | 1293 MEMACCESS(0) |
1309 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys | 1294 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys |
1310 "orr v2.8b, v1.8b, v1.8b \n" | 1295 "orr v2.8b, v1.8b, v1.8b \n" |
1311 MEMACCESS(1) | 1296 MEMACCESS(1) |
1312 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us | 1297 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us |
1313 MEMACCESS(2) | 1298 MEMACCESS(2) |
1314 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs | 1299 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs |
1315 "subs %4, %4, #16 \n" // 16 pixels | 1300 "subs %w4, %w4, #16 \n" // 16 pixels |
1316 MEMACCESS(3) | 1301 MEMACCESS(3) |
1317 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. | 1302 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. |
1318 "b.gt 1b \n" | 1303 "b.gt 1b \n" |
1319 : "+r"(src_y), // %0 | 1304 : "+r"(src_y), // %0 |
1320 "+r"(src_u), // %1 | 1305 "+r"(src_u), // %1 |
1321 "+r"(src_v), // %2 | 1306 "+r"(src_v), // %2 |
1322 "+r"(dst_yuy2), // %3 | 1307 "+r"(dst_yuy2), // %3 |
1323 "+r"(width) // %4 | 1308 "+r"(width) // %4 |
1324 : | 1309 : |
1325 : "cc", "memory", "v0", "v1", "v2", "v3" | 1310 : "cc", "memory", "v0", "v1", "v2", "v3" |
1326 ); | 1311 ); |
1327 } | 1312 } |
1328 #endif // HAS_I422TOYUY2ROW_NEON | 1313 #endif // HAS_I422TOYUY2ROW_NEON |
1329 | 1314 |
1330 #ifdef HAS_I422TOUYVYROW_NEON | 1315 #ifdef HAS_I422TOUYVYROW_NEON |
// I422ToUYVYRow_NEON: packs one row of planar Y, U and V bytes into
// interleaved 4-byte groups U Y0 V Y1.
// Per iteration:
//   - ld2 reads 16 Y bytes and de-interleaves them: even samples -> v1,
//     odd samples -> v2;
//   - the odd samples are copied to v3 (orr of a register with itself is a
//     move) so that v2 can be reused for the 8 V bytes;
//   - v0 receives the 8 U bytes;
//   - st4 interleaves v0,v1,v2,v3 byte-wise and writes 32 bytes to dst_uyvy.
// width is decremented by 16 per iteration and the loop is bottom-tested
// (subs ... b.gt), so at least 16 pixels are always processed and a width that
// is not a multiple of 16 is rounded up.
// The NEW column differs from OLD only in naming the counter %w4 instead of
// %4, i.e. the 32-bit W view of the register that holds the int 'width'.
1331 void I422ToUYVYRow_NEON(const uint8* src_y, | 1316 void I422ToUYVYRow_NEON(const uint8* src_y, |
1332 const uint8* src_u, | 1317 const uint8* src_u, |
1333 const uint8* src_v, | 1318 const uint8* src_v, |
1334 uint8* dst_uyvy, int width) { | 1319 uint8* dst_uyvy, int width) { |
1335 asm volatile ( | 1320 asm volatile ( |
1336 "1: \n" | 1321 "1: \n" |
1337 MEMACCESS(0) | 1322 MEMACCESS(0) |
1338 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys | 1323 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys |
1339 "orr v3.8b, v2.8b, v2.8b \n" | 1324 "orr v3.8b, v2.8b, v2.8b \n" |
1340 MEMACCESS(1) | 1325 MEMACCESS(1) |
1341 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us | 1326 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us |
1342 MEMACCESS(2) | 1327 MEMACCESS(2) |
1343 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs | 1328 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs |
1344 "subs %4, %4, #16 \n" // 16 pixels | 1329 "subs %w4, %w4, #16 \n" // 16 pixels |
1345 MEMACCESS(3) | 1330 MEMACCESS(3) |
1346 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. | 1331 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. |
1347 "b.gt 1b \n" | 1332 "b.gt 1b \n" |
1348 : "+r"(src_y), // %0 | 1333 : "+r"(src_y), // %0 |
1349 "+r"(src_u), // %1 | 1334 "+r"(src_u), // %1 |
1350 "+r"(src_v), // %2 | 1335 "+r"(src_v), // %2 |
1351 "+r"(dst_uyvy), // %3 | 1336 "+r"(dst_uyvy), // %3 |
1352 "+r"(width) // %4 | 1337 "+r"(width) // %4 |
1353 : | 1338 : |
1354 : "cc", "memory", "v0", "v1", "v2", "v3" | 1339 : "cc", "memory", "v0", "v1", "v2", "v3" |
1355 ); | 1340 ); |
1356 } | 1341 } |
1357 #endif // HAS_I422TOUYVYROW_NEON | 1342 #endif // HAS_I422TOUYVYROW_NEON |
1358 | 1343 |
1359 #ifdef HAS_ARGBTORGB565ROW_NEON | 1344 #ifdef HAS_ARGBTORGB565ROW_NEON |
// ARGBToRGB565Row_NEON: converts a row of 4-byte pixels to 2-byte RGB565
// pixels, 8 pixels per iteration.
//   src_argb:   source row; 32 bytes read per iteration.
//   dst_rgb565: destination row; 16 bytes written per iteration.
//   pix:        pixel count; decremented by 8 per iteration.
// ld4 de-interleaves the four byte channels of 8 pixels into v20..v23. The
// packing itself is done by the ARGBTORGB565 macro, which is defined outside
// this view. NOTE(review): the macro presumably consumes v20..v22 and leaves
// the packed pixels in v0 (v0 is what gets stored and is in the clobber
// list) - confirm against the macro definition.
// The loop is bottom-tested (subs ... b.gt): it runs at least once and a pix
// that is not a multiple of 8 is rounded up.
// The NEW column differs from OLD only in naming the counter %w2 instead of
// %2, i.e. the 32-bit W view of the register that holds the int 'pix'.
1360 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { | 1345 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { |
1361 asm volatile ( | 1346 asm volatile ( |
1362 "1: \n" | 1347 "1: \n" |
1363 MEMACCESS(0) | 1348 MEMACCESS(0) |
1364 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1349 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1365 "subs %2, %2, #8 \n" // 8 processed per loop. | 1350 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1366 ARGBTORGB565 | 1351 ARGBTORGB565 |
1367 MEMACCESS(1) | 1352 MEMACCESS(1) |
1368 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. | 1353 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. |
1369 "b.gt 1b \n" | 1354 "b.gt 1b \n" |
1370 : "+r"(src_argb), // %0 | 1355 : "+r"(src_argb), // %0 |
1371 "+r"(dst_rgb565), // %1 | 1356 "+r"(dst_rgb565), // %1 |
1372 "+r"(pix) // %2 | 1357 "+r"(pix) // %2 |
1373 : | 1358 : |
1374 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1359 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
1375 ); | 1360 ); |
1376 } | 1361 } |
1377 #endif // HAS_ARGBTORGB565ROW_NEON | 1362 #endif // HAS_ARGBTORGB565ROW_NEON |
1378 | 1363 |
// ARGBToRGB565DitherRow_NEON: like ARGBToRGB565Row_NEON, but saturating-adds a
// per-pixel dither byte to three of the four de-interleaved channels (v20..v22;
// v23 is left untouched) before packing.
//   src_argb: source row; 32 bytes read per iteration (post-incremented).
//   dst_rgb:  destination row; 16 bytes written per iteration.
//   dither4:  four dither bytes packed in a 32-bit value. dup broadcasts it to
//             every 32-bit lane of v1, so pixel i of each group of 8 receives
//             dither byte (i % 4).
//   width:    pixel count; decremented by 8 per iteration. The loop is
//             bottom-tested (subs ... b.gt): it runs at least once and a width
//             that is not a multiple of 8 is rounded up.
// Fix: src_argb and width are both written by the asm (pointer post-increment
// and subs), so they must be read-write "+r" operands. They were previously
// input-only "r" operands, which GCC requires the asm to leave unmodified.
// Output operands are numbered first, so width is now %2 and dither4 is %3.
// The packing is done by the ARGBTORGB565 macro, defined outside this view.
| 1364 #ifdef HAS_ARGBTORGB565DITHERROW_NEON |
| 1365 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, |
| 1366 const uint32 dither4, int width) { |
| 1367 asm volatile ( |
| 1368 "dup v1.4s, %w3 \n" // dither4 |
| 1369 "1: \n" |
| 1370 MEMACCESS(1) |
| 1371 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels |
| 1372 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1373 "uqadd v20.8b, v20.8b, v1.8b \n" |
| 1374 "uqadd v21.8b, v21.8b, v1.8b \n" |
| 1375 "uqadd v22.8b, v22.8b, v1.8b \n" |
| 1376 ARGBTORGB565 |
| 1377 MEMACCESS(0) |
| 1378 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. |
| 1379 "b.gt 1b \n" |
| 1380 : "+r"(dst_rgb), // %0 |
| 1381 "+r"(src_argb), // %1 |
| 1382 "+r"(width) // %2 |
| 1383 : "r"(dither4) // %3 |
| 1384 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" |
| 1385 ); |
| 1386 } |
| 1387 #endif // HAS_ARGBTORGB565DITHERROW_NEON |
| 1388 |
1379 #ifdef HAS_ARGBTOARGB1555ROW_NEON | 1389 #ifdef HAS_ARGBTOARGB1555ROW_NEON |
// ARGBToARGB1555Row_NEON: converts a row of 4-byte pixels to 2-byte ARGB1555
// pixels, 8 pixels per iteration.
//   src_argb:     source row; 32 bytes read per iteration.
//   dst_argb1555: destination row; 16 bytes written per iteration.
//   pix:          pixel count; decremented by 8 per iteration.
// ld4 de-interleaves the four byte channels of 8 pixels into v20..v23. The
// packing is done by the ARGBTOARGB1555 macro, which is defined outside this
// view. NOTE(review): the macro presumably leaves the packed pixels in v0
// (v0 is what gets stored) - confirm against the macro definition.
// The loop is bottom-tested (subs ... b.gt): it runs at least once and a pix
// that is not a multiple of 8 is rounded up.
// The NEW column differs from OLD only in naming the counter %w2 instead of
// %2, i.e. the 32-bit W view of the register that holds the int 'pix'.
1380 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, | 1390 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, |
1381 int pix) { | 1391 int pix) { |
1382 asm volatile ( | 1392 asm volatile ( |
1383 "1: \n" | 1393 "1: \n" |
1384 MEMACCESS(0) | 1394 MEMACCESS(0) |
1385 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1395 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1386 "subs %2, %2, #8 \n" // 8 processed per loop. | 1396 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1387 ARGBTOARGB1555 | 1397 ARGBTOARGB1555 |
1388 MEMACCESS(1) | 1398 MEMACCESS(1) |
1389 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. | 1399 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. |
1390 "b.gt 1b \n" | 1400 "b.gt 1b \n" |
1391 : "+r"(src_argb), // %0 | 1401 : "+r"(src_argb), // %0 |
1392 "+r"(dst_argb1555), // %1 | 1402 "+r"(dst_argb1555), // %1 |
1393 "+r"(pix) // %2 | 1403 "+r"(pix) // %2 |
1394 : | 1404 : |
1395 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1405 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
1396 ); | 1406 ); |
1397 } | 1407 } |
1398 #endif // HAS_ARGBTOARGB1555ROW_NEON | 1408 #endif // HAS_ARGBTOARGB1555ROW_NEON |
1399 | 1409 |
1400 #ifdef HAS_ARGBTOARGB4444ROW_NEON | 1410 #ifdef HAS_ARGBTOARGB4444ROW_NEON |
// ARGBToARGB4444Row_NEON: converts a row of 4-byte pixels to 2-byte ARGB4444
// pixels, 8 pixels per iteration.
//   src_argb:     source row; 32 bytes read per iteration.
//   dst_argb4444: destination row; 16 bytes written per iteration.
//   pix:          pixel count; decremented by 8 per iteration.
// Before the loop v4 is filled with 0x0f in every byte; it is a loop-invariant
// mask for the ARGBTOARGB4444 macro, which is defined outside this view.
// NOTE(review): the macro presumably uses v1 as scratch and v4 as the mask
// (both appear in the clobber list) and leaves the packed pixels in v0 -
// confirm against the macro definition.
// The loop is bottom-tested (subs ... b.gt): it runs at least once and a pix
// that is not a multiple of 8 is rounded up.
// The NEW column differs from OLD only in naming the counter %w2 instead of
// %2, i.e. the 32-bit W view of the register that holds the int 'pix'.
1401 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, | 1411 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, |
1402 int pix) { | 1412 int pix) { |
1403 asm volatile ( | 1413 asm volatile ( |
1404 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 1414 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
1405 "1: \n" | 1415 "1: \n" |
1406 MEMACCESS(0) | 1416 MEMACCESS(0) |
1407 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1417 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
1408 "subs %2, %2, #8 \n" // 8 processed per loop. | 1418 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1409 ARGBTOARGB4444 | 1419 ARGBTOARGB4444 |
1410 MEMACCESS(1) | 1420 MEMACCESS(1) |
1411 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. | 1421 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. |
1412 "b.gt 1b \n" | 1422 "b.gt 1b \n" |
1413 : "+r"(src_argb), // %0 | 1423 : "+r"(src_argb), // %0 |
1414 "+r"(dst_argb4444), // %1 | 1424 "+r"(dst_argb4444), // %1 |
1415 "+r"(pix) // %2 | 1425 "+r"(pix) // %2 |
1416 : | 1426 : |
1417 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" | 1427 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" |
1418 ); | 1428 ); |
1419 } | 1429 } |
1420 #endif // HAS_ARGBTOARGB4444ROW_NEON | 1430 #endif // HAS_ARGBTOARGB4444ROW_NEON |
1421 | 1431 |
1422 #ifdef HAS_ARGBTOYROW_NEON | 1432 #ifdef HAS_ARGBTOYROW_NEON |
// ARGBToYRow_NEON: computes one luma byte per pixel, 8 pixels per iteration.
//   src_argb: source row; 32 bytes read per iteration.
//   dst_y:    destination row; 8 bytes written per iteration.
//   pix:      pixel count; decremented by 8 per iteration.
// Per pixel the code evaluates
//   Y = sat_u8( sat_u8((13*B + 65*G + 33*R + 64) >> 7) + 16 )
// using 8-bit fixed-point coefficients scaled by 128 (set up once in v4..v6,
// with the +16 offset in v7):
//   - ld4 de-interleaves the four byte channels into v0..v3; v3 is then
//     immediately reused as the 16-bit accumulator, so the fourth channel
//     does not contribute to the result;
//   - umull/umlal build the widened weighted sum. Its maximum,
//     255 * (13 + 65 + 33) = 28305, fits in a signed 16-bit lane, which is
//     what makes the signed-input sqrshrun safe here;
//   - sqrshrun #7 rounds, shifts and narrows to unsigned 8 bits; uqadd adds
//     the offset with unsigned saturation.
// The loop is bottom-tested (subs ... b.gt): it runs at least once and a pix
// that is not a multiple of 8 is rounded up.
// The NEW column differs from OLD only in naming the counter %w2 instead of
// %2, i.e. the 32-bit W view of the register that holds the int 'pix'.
1423 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1433 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |
1424 asm volatile ( | 1434 asm volatile ( |
1425 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1435 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
1426 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1436 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
1427 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1437 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
1428 "movi v7.8b, #16 \n" // Add 16 constant | 1438 "movi v7.8b, #16 \n" // Add 16 constant |
1429 "1: \n" | 1439 "1: \n" |
1430 MEMACCESS(0) | 1440 MEMACCESS(0) |
1431 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1441 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1432 "subs %2, %2, #8 \n" // 8 processed per loop. | 1442 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1433 "umull v3.8h, v0.8b, v4.8b \n" // B | 1443 "umull v3.8h, v0.8b, v4.8b \n" // B |
1434 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1444 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1435 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1445 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1436 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1446 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
1437 "uqadd v0.8b, v0.8b, v7.8b \n" | 1447 "uqadd v0.8b, v0.8b, v7.8b \n" |
1438 MEMACCESS(1) | 1448 MEMACCESS(1) |
1439 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1449 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1440 "b.gt 1b \n" | 1450 "b.gt 1b \n" |
1441 : "+r"(src_argb), // %0 | 1451 : "+r"(src_argb), // %0 |
1442 "+r"(dst_y), // %1 | 1452 "+r"(dst_y), // %1 |
1443 "+r"(pix) // %2 | 1453 "+r"(pix) // %2 |
1444 : | 1454 : |
1445 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 1455 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
1446 ); | 1456 ); |
1447 } | 1457 } |
1448 #endif // HAS_ARGBTOYROW_NEON | 1458 #endif // HAS_ARGBTOYROW_NEON |
1449 | 1459 |
1450 #ifdef HAS_ARGBTOYJROW_NEON | 1460 #ifdef HAS_ARGBTOYJROW_NEON |
1451 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { | 1461 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { |
1452 asm volatile ( | 1462 asm volatile ( |
1453 "movi v4.8b, #15 \n" // B * 0.11400 coefficient | 1463 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
1454 "movi v5.8b, #75 \n" // G * 0.58700 coefficient | 1464 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
1455 "movi v6.8b, #38 \n" // R * 0.29900 coefficient | 1465 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
1456 "1: \n" | 1466 "1: \n" |
1457 MEMACCESS(0) | 1467 MEMACCESS(0) |
1458 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1468 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1459 "subs %2, %2, #8 \n" // 8 processed per loop. | 1469 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
1460 "umull v3.8h, v0.8b, v4.8b \n" // B | 1470 "umull v3.8h, v0.8b, v4.8b \n" // B |
1461 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1471 "umlal v3.8h, v1.8b, v5.8b \n" // G |
1462 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1472 "umlal v3.8h, v2.8b, v6.8b \n" // R |
1463 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y | 1473 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
1464 MEMACCESS(1) | 1474 MEMACCESS(1) |
1465 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1475 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
1466 "b.gt 1b \n" | 1476 "b.gt 1b \n" |
1467 : "+r"(src_argb), // %0 | 1477 : "+r"(src_argb), // %0 |
1468 "+r"(dst_y), // %1 | 1478 "+r"(dst_y), // %1 |
1469 "+r"(pix) // %2 | 1479 "+r"(pix) // %2 |
(...skipping 10 matching lines...) Expand all Loading... |
1480 asm volatile ( | 1490 asm volatile ( |
1481 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient | 1491 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient |
1482 "movi v25.8b, #74 \n" // UG -0.5781 coefficient | 1492 "movi v25.8b, #74 \n" // UG -0.5781 coefficient |
1483 "movi v26.8b, #38 \n" // UR -0.2969 coefficient | 1493 "movi v26.8b, #38 \n" // UR -0.2969 coefficient |
1484 "movi v27.8b, #18 \n" // VB -0.1406 coefficient | 1494 "movi v27.8b, #18 \n" // VB -0.1406 coefficient |
1485 "movi v28.8b, #94 \n" // VG -0.7344 coefficient | 1495 "movi v28.8b, #94 \n" // VG -0.7344 coefficient |
1486 "movi v29.16b,#0x80 \n" // 128.5 | 1496 "movi v29.16b,#0x80 \n" // 128.5 |
1487 "1: \n" | 1497 "1: \n" |
1488 MEMACCESS(0) | 1498 MEMACCESS(0) |
1489 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1499 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
1490 "subs %3, %3, #8 \n" // 8 processed per loop. | 1500 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
1491 "umull v4.8h, v0.8b, v24.8b \n" // B | 1501 "umull v4.8h, v0.8b, v24.8b \n" // B |
1492 "umlsl v4.8h, v1.8b, v25.8b \n" // G | 1502 "umlsl v4.8h, v1.8b, v25.8b \n" // G |
1493 "umlsl v4.8h, v2.8b, v26.8b \n" // R | 1503 "umlsl v4.8h, v2.8b, v26.8b \n" // R |
1494 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned | 1504 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned |
1495 | 1505 |
1496 "umull v3.8h, v2.8b, v24.8b \n" // R | 1506 "umull v3.8h, v2.8b, v24.8b \n" // R |
1497 "umlsl v3.8h, v1.8b, v28.8b \n" // G | 1507 "umlsl v3.8h, v1.8b, v28.8b \n" // G |
1498 "umlsl v3.8h, v0.8b, v27.8b \n" // B | 1508 "umlsl v3.8h, v0.8b, v27.8b \n" // B |
1499 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned | 1509 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned |
1500 | 1510 |
(...skipping 23 matching lines...) Expand all Loading... |
1524 asm volatile ( | 1534 asm volatile ( |
1525 RGBTOUV_SETUP_REG | 1535 RGBTOUV_SETUP_REG |
1526 "1: \n" | 1536 "1: \n" |
1527 MEMACCESS(0) | 1537 MEMACCESS(0) |
1528 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1538 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
1529 | 1539 |
1530 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1540 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
1531 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1541 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
1532 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1542 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
1533 | 1543 |
1534 "subs %3, %3, #16 \n" // 16 processed per loop. | 1544 "subs %w3, %w3, #16 \n" // 16 processed per loop. |
1535 "mul v3.8h, v0.8h, v20.8h \n" // B | 1545 "mul v3.8h, v0.8h, v20.8h \n" // B |
1536 "mls v3.8h, v1.8h, v21.8h \n" // G | 1546 "mls v3.8h, v1.8h, v21.8h \n" // G |
1537 "mls v3.8h, v2.8h, v22.8h \n" // R | 1547 "mls v3.8h, v2.8h, v22.8h \n" // R |
1538 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned | 1548 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned |
1539 | 1549 |
1540 "mul v4.8h, v2.8h, v20.8h \n" // R | 1550 "mul v4.8h, v2.8h, v20.8h \n" // R |
1541 "mls v4.8h, v1.8h, v24.8h \n" // G | 1551 "mls v4.8h, v1.8h, v24.8h \n" // G |
1542 "mls v4.8h, v0.8h, v23.8h \n" // B | 1552 "mls v4.8h, v0.8h, v23.8h \n" // B |
1543 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned | 1553 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned |
1544 | 1554 |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1580 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. | 1590 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. |
1581 | 1591 |
1582 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. | 1592 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. |
1583 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. | 1593 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. |
1584 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. | 1594 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. |
1585 | 1595 |
1586 "urshr v0.8h, v0.8h, #1 \n" // 2x average | 1596 "urshr v0.8h, v0.8h, #1 \n" // 2x average |
1587 "urshr v1.8h, v1.8h, #1 \n" | 1597 "urshr v1.8h, v1.8h, #1 \n" |
1588 "urshr v2.8h, v2.8h, #1 \n" | 1598 "urshr v2.8h, v2.8h, #1 \n" |
1589 | 1599 |
1590 "subs %3, %3, #32 \n" // 32 processed per loop. | 1600 "subs %w3, %w3, #32 \n" // 32 processed per loop. |
1591 "mul v3.8h, v0.8h, v20.8h \n" // B | 1601 "mul v3.8h, v0.8h, v20.8h \n" // B |
1592 "mls v3.8h, v1.8h, v21.8h \n" // G | 1602 "mls v3.8h, v1.8h, v21.8h \n" // G |
1593 "mls v3.8h, v2.8h, v22.8h \n" // R | 1603 "mls v3.8h, v2.8h, v22.8h \n" // R |
1594 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned | 1604 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned |
1595 "mul v4.8h, v2.8h, v20.8h \n" // R | 1605 "mul v4.8h, v2.8h, v20.8h \n" // R |
1596 "mls v4.8h, v1.8h, v24.8h \n" // G | 1606 "mls v4.8h, v1.8h, v24.8h \n" // G |
1597 "mls v4.8h, v0.8h, v23.8h \n" // B | 1607 "mls v4.8h, v0.8h, v23.8h \n" // B |
1598 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned | 1608 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned |
1599 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U | 1609 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U |
1600 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V | 1610 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1646 MEMACCESS(1) | 1656 MEMACCESS(1) |
1647 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 | 1657 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 |
1648 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. | 1658 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. |
1649 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. | 1659 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. |
1650 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. | 1660 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. |
1651 | 1661 |
1652 "urshr v0.8h, v0.8h, #1 \n" // 2x average | 1662 "urshr v0.8h, v0.8h, #1 \n" // 2x average |
1653 "urshr v1.8h, v1.8h, #1 \n" | 1663 "urshr v1.8h, v1.8h, #1 \n" |
1654 "urshr v2.8h, v2.8h, #1 \n" | 1664 "urshr v2.8h, v2.8h, #1 \n" |
1655 | 1665 |
1656 "subs %4, %4, #16 \n" // 32 processed per loop. | 1666 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1657 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1667 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1658 MEMACCESS(2) | 1668 MEMACCESS(2) |
1659 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1669 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1660 MEMACCESS(3) | 1670 MEMACCESS(3) |
1661 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1671 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1662 "b.gt 1b \n" | 1672 "b.gt 1b \n" |
1663 : "+r"(src_argb), // %0 | 1673 : "+r"(src_argb), // %0 |
1664 "+r"(src_argb_1), // %1 | 1674 "+r"(src_argb_1), // %1 |
1665 "+r"(dst_u), // %2 | 1675 "+r"(dst_u), // %2 |
1666 "+r"(dst_v), // %3 | 1676 "+r"(dst_v), // %3 |
(...skipping 26 matching lines...) Expand all Loading... |
1693 MEMACCESS(1) | 1703 MEMACCESS(1) |
1694 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 | 1704 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 |
1695 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. | 1705 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. |
1696 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. | 1706 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. |
1697 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. | 1707 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. |
1698 | 1708 |
1699 "urshr v0.8h, v0.8h, #1 \n" // 2x average | 1709 "urshr v0.8h, v0.8h, #1 \n" // 2x average |
1700 "urshr v1.8h, v1.8h, #1 \n" | 1710 "urshr v1.8h, v1.8h, #1 \n" |
1701 "urshr v2.8h, v2.8h, #1 \n" | 1711 "urshr v2.8h, v2.8h, #1 \n" |
1702 | 1712 |
1703 "subs %4, %4, #16 \n" // 32 processed per loop. | 1713 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1704 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1714 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1705 MEMACCESS(2) | 1715 MEMACCESS(2) |
1706 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1716 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1707 MEMACCESS(3) | 1717 MEMACCESS(3) |
1708 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1718 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1709 "b.gt 1b \n" | 1719 "b.gt 1b \n" |
1710 : "+r"(src_argb), // %0 | 1720 : "+r"(src_argb), // %0 |
1711 "+r"(src_argb_1), // %1 | 1721 "+r"(src_argb_1), // %1 |
1712 "+r"(dst_u), // %2 | 1722 "+r"(dst_u), // %2 |
1713 "+r"(dst_v), // %3 | 1723 "+r"(dst_v), // %3 |
(...skipping 20 matching lines...) Expand all Loading... |
1734 MEMACCESS(1) | 1744 MEMACCESS(1) |
1735 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more | 1745 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more |
1736 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. | 1746 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. |
1737 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. | 1747 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. |
1738 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. | 1748 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. |
1739 | 1749 |
1740 "urshr v0.8h, v0.8h, #1 \n" // 2x average | 1750 "urshr v0.8h, v0.8h, #1 \n" // 2x average |
1741 "urshr v1.8h, v3.8h, #1 \n" | 1751 "urshr v1.8h, v3.8h, #1 \n" |
1742 "urshr v2.8h, v2.8h, #1 \n" | 1752 "urshr v2.8h, v2.8h, #1 \n" |
1743 | 1753 |
1744 "subs %4, %4, #16 \n" // 32 processed per loop. | 1754 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1745 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1755 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1746 MEMACCESS(2) | 1756 MEMACCESS(2) |
1747 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1757 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1748 MEMACCESS(3) | 1758 MEMACCESS(3) |
1749 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1759 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1750 "b.gt 1b \n" | 1760 "b.gt 1b \n" |
1751 : "+r"(src_bgra), // %0 | 1761 : "+r"(src_bgra), // %0 |
1752 "+r"(src_bgra_1), // %1 | 1762 "+r"(src_bgra_1), // %1 |
1753 "+r"(dst_u), // %2 | 1763 "+r"(dst_u), // %2 |
1754 "+r"(dst_v), // %3 | 1764 "+r"(dst_v), // %3 |
(...skipping 20 matching lines...) Expand all Loading... |
1775 MEMACCESS(1) | 1785 MEMACCESS(1) |
1776 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. | 1786 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. |
1777 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. | 1787 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. |
1778 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. | 1788 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. |
1779 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. | 1789 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. |
1780 | 1790 |
1781 "urshr v0.8h, v3.8h, #1 \n" // 2x average | 1791 "urshr v0.8h, v3.8h, #1 \n" // 2x average |
1782 "urshr v2.8h, v2.8h, #1 \n" | 1792 "urshr v2.8h, v2.8h, #1 \n" |
1783 "urshr v1.8h, v1.8h, #1 \n" | 1793 "urshr v1.8h, v1.8h, #1 \n" |
1784 | 1794 |
1785 "subs %4, %4, #16 \n" // 32 processed per loop. | 1795 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1786 RGBTOUV(v0.8h, v2.8h, v1.8h) | 1796 RGBTOUV(v0.8h, v2.8h, v1.8h) |
1787 MEMACCESS(2) | 1797 MEMACCESS(2) |
1788 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1798 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1789 MEMACCESS(3) | 1799 MEMACCESS(3) |
1790 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1800 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1791 "b.gt 1b \n" | 1801 "b.gt 1b \n" |
1792 : "+r"(src_abgr), // %0 | 1802 : "+r"(src_abgr), // %0 |
1793 "+r"(src_abgr_1), // %1 | 1803 "+r"(src_abgr_1), // %1 |
1794 "+r"(dst_u), // %2 | 1804 "+r"(dst_u), // %2 |
1795 "+r"(dst_v), // %3 | 1805 "+r"(dst_v), // %3 |
(...skipping 20 matching lines...) Expand all Loading... |
1816 MEMACCESS(1) | 1826 MEMACCESS(1) |
1817 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. | 1827 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. |
1818 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. | 1828 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. |
1819 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. | 1829 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. |
1820 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. | 1830 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. |
1821 | 1831 |
1822 "urshr v0.8h, v0.8h, #1 \n" // 2x average | 1832 "urshr v0.8h, v0.8h, #1 \n" // 2x average |
1823 "urshr v1.8h, v1.8h, #1 \n" | 1833 "urshr v1.8h, v1.8h, #1 \n" |
1824 "urshr v2.8h, v2.8h, #1 \n" | 1834 "urshr v2.8h, v2.8h, #1 \n" |
1825 | 1835 |
1826 "subs %4, %4, #16 \n" // 32 processed per loop. | 1836 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1827 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1837 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1828 MEMACCESS(2) | 1838 MEMACCESS(2) |
1829 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1839 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1830 MEMACCESS(3) | 1840 MEMACCESS(3) |
1831 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1841 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1832 "b.gt 1b \n" | 1842 "b.gt 1b \n" |
1833 : "+r"(src_rgba), // %0 | 1843 : "+r"(src_rgba), // %0 |
1834 "+r"(src_rgba_1), // %1 | 1844 "+r"(src_rgba_1), // %1 |
1835 "+r"(dst_u), // %2 | 1845 "+r"(dst_u), // %2 |
1836 "+r"(dst_v), // %3 | 1846 "+r"(dst_v), // %3 |
(...skipping 20 matching lines...) Expand all Loading... |
1857 MEMACCESS(1) | 1867 MEMACCESS(1) |
1858 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. | 1868 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. |
1859 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. | 1869 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. |
1860 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. | 1870 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. |
1861 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. | 1871 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. |
1862 | 1872 |
1863 "urshr v0.8h, v0.8h, #1 \n" // 2x average | 1873 "urshr v0.8h, v0.8h, #1 \n" // 2x average |
1864 "urshr v1.8h, v1.8h, #1 \n" | 1874 "urshr v1.8h, v1.8h, #1 \n" |
1865 "urshr v2.8h, v2.8h, #1 \n" | 1875 "urshr v2.8h, v2.8h, #1 \n" |
1866 | 1876 |
1867 "subs %4, %4, #16 \n" // 32 processed per loop. | 1877 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1868 RGBTOUV(v0.8h, v1.8h, v2.8h) | 1878 RGBTOUV(v0.8h, v1.8h, v2.8h) |
1869 MEMACCESS(2) | 1879 MEMACCESS(2) |
1870 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1880 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1871 MEMACCESS(3) | 1881 MEMACCESS(3) |
1872 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1882 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1873 "b.gt 1b \n" | 1883 "b.gt 1b \n" |
1874 : "+r"(src_rgb24), // %0 | 1884 : "+r"(src_rgb24), // %0 |
1875 "+r"(src_rgb24_1), // %1 | 1885 "+r"(src_rgb24_1), // %1 |
1876 "+r"(dst_u), // %2 | 1886 "+r"(dst_u), // %2 |
1877 "+r"(dst_v), // %3 | 1887 "+r"(dst_v), // %3 |
(...skipping 20 matching lines...) Expand all Loading... |
1898 MEMACCESS(1) | 1908 MEMACCESS(1) |
1899 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels | 1909 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels |
1900 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. | 1910 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. |
1901 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. | 1911 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. |
1902 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. | 1912 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. |
1903 | 1913 |
1904 "urshr v2.8h, v2.8h, #1 \n" // 2x average | 1914 "urshr v2.8h, v2.8h, #1 \n" // 2x average |
1905 "urshr v1.8h, v1.8h, #1 \n" | 1915 "urshr v1.8h, v1.8h, #1 \n" |
1906 "urshr v0.8h, v0.8h, #1 \n" | 1916 "urshr v0.8h, v0.8h, #1 \n" |
1907 | 1917 |
1908 "subs %4, %4, #16 \n" // 32 processed per loop. | 1918 "subs %w4, %w4, #16 \n" // 32 processed per loop. |
1909 RGBTOUV(v2.8h, v1.8h, v0.8h) | 1919 RGBTOUV(v2.8h, v1.8h, v0.8h) |
1910 MEMACCESS(2) | 1920 MEMACCESS(2) |
1911 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. | 1921 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. |
1912 MEMACCESS(3) | 1922 MEMACCESS(3) |
1913 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. | 1923 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. |
1914 "b.gt 1b \n" | 1924 "b.gt 1b \n" |
1915 : "+r"(src_raw), // %0 | 1925 : "+r"(src_raw), // %0 |
1916 "+r"(src_raw_1), // %1 | 1926 "+r"(src_raw_1), // %1 |
1917 "+r"(dst_u), // %2 | 1927 "+r"(dst_u), // %2 |
1918 "+r"(dst_v), // %3 | 1928 "+r"(dst_v), // %3 |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1964 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 1974 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
1965 | 1975 |
1966 "ins v16.D[1], v17.D[0] \n" | 1976 "ins v16.D[1], v17.D[0] \n" |
1967 "ins v18.D[1], v19.D[0] \n" | 1977 "ins v18.D[1], v19.D[0] \n" |
1968 "ins v20.D[1], v21.D[0] \n" | 1978 "ins v20.D[1], v21.D[0] \n" |
1969 | 1979 |
1970 "urshr v4.8h, v16.8h, #1 \n" // 2x average | 1980 "urshr v4.8h, v16.8h, #1 \n" // 2x average |
1971 "urshr v5.8h, v18.8h, #1 \n" | 1981 "urshr v5.8h, v18.8h, #1 \n" |
1972 "urshr v6.8h, v20.8h, #1 \n" | 1982 "urshr v6.8h, v20.8h, #1 \n" |
1973 | 1983 |
1974 "subs %4, %4, #16 \n" // 16 processed per loop. | 1984 "subs %w4, %w4, #16 \n" // 16 processed per loop. |
1975 "mul v16.8h, v4.8h, v22.8h \n" // B | 1985 "mul v16.8h, v4.8h, v22.8h \n" // B |
1976 "mls v16.8h, v5.8h, v23.8h \n" // G | 1986 "mls v16.8h, v5.8h, v23.8h \n" // G |
1977 "mls v16.8h, v6.8h, v24.8h \n" // R | 1987 "mls v16.8h, v6.8h, v24.8h \n" // R |
1978 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned | 1988 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned |
1979 "mul v17.8h, v6.8h, v22.8h \n" // R | 1989 "mul v17.8h, v6.8h, v22.8h \n" // R |
1980 "mls v17.8h, v5.8h, v26.8h \n" // G | 1990 "mls v17.8h, v5.8h, v26.8h \n" // G |
1981 "mls v17.8h, v4.8h, v25.8h \n" // B | 1991 "mls v17.8h, v4.8h, v25.8h \n" // B |
1982 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned | 1992 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned |
1983 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U | 1993 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U |
1984 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V | 1994 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2035 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 2045 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
2036 | 2046 |
2037 "ins v16.D[1], v26.D[0] \n" | 2047 "ins v16.D[1], v26.D[0] \n" |
2038 "ins v17.D[1], v27.D[0] \n" | 2048 "ins v17.D[1], v27.D[0] \n" |
2039 "ins v18.D[1], v28.D[0] \n" | 2049 "ins v18.D[1], v28.D[0] \n" |
2040 | 2050 |
2041 "urshr v4.8h, v16.8h, #1 \n" // 2x average | 2051 "urshr v4.8h, v16.8h, #1 \n" // 2x average |
2042 "urshr v5.8h, v17.8h, #1 \n" | 2052 "urshr v5.8h, v17.8h, #1 \n" |
2043 "urshr v6.8h, v18.8h, #1 \n" | 2053 "urshr v6.8h, v18.8h, #1 \n" |
2044 | 2054 |
2045 "subs %4, %4, #16 \n" // 16 processed per loop. | 2055 "subs %w4, %w4, #16 \n" // 16 processed per loop. |
2046 "mul v2.8h, v4.8h, v20.8h \n" // B | 2056 "mul v2.8h, v4.8h, v20.8h \n" // B |
2047 "mls v2.8h, v5.8h, v21.8h \n" // G | 2057 "mls v2.8h, v5.8h, v21.8h \n" // G |
2048 "mls v2.8h, v6.8h, v22.8h \n" // R | 2058 "mls v2.8h, v6.8h, v22.8h \n" // R |
2049 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned | 2059 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned |
2050 "mul v3.8h, v6.8h, v20.8h \n" // R | 2060 "mul v3.8h, v6.8h, v20.8h \n" // R |
2051 "mls v3.8h, v5.8h, v24.8h \n" // G | 2061 "mls v3.8h, v5.8h, v24.8h \n" // G |
2052 "mls v3.8h, v4.8h, v23.8h \n" // B | 2062 "mls v3.8h, v4.8h, v23.8h \n" // B |
2053 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned | 2063 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned |
2054 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U | 2064 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U |
2055 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 2065 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2106 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. | 2116 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. |
2107 | 2117 |
2108 "ins v16.D[1], v26.D[0] \n" | 2118 "ins v16.D[1], v26.D[0] \n" |
2109 "ins v17.D[1], v27.D[0] \n" | 2119 "ins v17.D[1], v27.D[0] \n" |
2110 "ins v18.D[1], v28.D[0] \n" | 2120 "ins v18.D[1], v28.D[0] \n" |
2111 | 2121 |
2112 "urshr v4.8h, v16.8h, #1 \n" // 2x average | 2122 "urshr v4.8h, v16.8h, #1 \n" // 2x average |
2113 "urshr v5.8h, v17.8h, #1 \n" | 2123 "urshr v5.8h, v17.8h, #1 \n" |
2114 "urshr v6.8h, v18.8h, #1 \n" | 2124 "urshr v6.8h, v18.8h, #1 \n" |
2115 | 2125 |
2116 "subs %4, %4, #16 \n" // 16 processed per loop. | 2126 "subs %w4, %w4, #16 \n" // 16 processed per loop. |
2117 "mul v2.8h, v4.8h, v20.8h \n" // B | 2127 "mul v2.8h, v4.8h, v20.8h \n" // B |
2118 "mls v2.8h, v5.8h, v21.8h \n" // G | 2128 "mls v2.8h, v5.8h, v21.8h \n" // G |
2119 "mls v2.8h, v6.8h, v22.8h \n" // R | 2129 "mls v2.8h, v6.8h, v22.8h \n" // R |
2120 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned | 2130 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned |
2121 "mul v3.8h, v6.8h, v20.8h \n" // R | 2131 "mul v3.8h, v6.8h, v20.8h \n" // R |
2122 "mls v3.8h, v5.8h, v24.8h \n" // G | 2132 "mls v3.8h, v5.8h, v24.8h \n" // G |
2123 "mls v3.8h, v4.8h, v23.8h \n" // B | 2133 "mls v3.8h, v4.8h, v23.8h \n" // B |
2124 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned | 2134 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned |
2125 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U | 2135 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U |
2126 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V | 2136 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V |
(...skipping 19 matching lines...) Expand all Loading... |
2146 #ifdef HAS_RGB565TOYROW_NEON | 2156 #ifdef HAS_RGB565TOYROW_NEON |
// RGB565ToYRow_NEON: converts a row of RGB565 pixels to 8-bit luma (Y).
//   src_rgb565: source pixels; 16 bytes (8 pixels) are consumed per iteration.
//   dst_y:      destination luma; 8 bytes are written per iteration.
//   pix:        pixel count. It is decremented by 8 per iteration and the loop
//               repeats while the result is > 0, so whole groups of 8 pixels
//               are always processed.
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16 (saturating).
// RGB565TOARGB is a macro defined elsewhere in the file; judging by the
// multiply comments below it leaves B, G, R in v0, v1, v2 -- TODO confirm.
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2147 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { | 2157 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { |
2148 asm volatile ( | 2158 asm volatile ( |
2149 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2159 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
2150 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2160 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
2151 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2161 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
2152 "movi v27.8b, #16 \n" // Add 16 constant | 2162 "movi v27.8b, #16 \n" // Add 16 constant |
2153 "1: \n" | 2163 "1: \n" |
2154 MEMACCESS(0) | 2164 MEMACCESS(0) |
2155 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 2165 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
2156 "subs %2, %2, #8 \n" // 8 processed per loop. | 2166 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2157 RGB565TOARGB | 2167 RGB565TOARGB |
2158 "umull v3.8h, v0.8b, v24.8b \n" // B | 2168 "umull v3.8h, v0.8b, v24.8b \n" // B |
2159 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2169 "umlal v3.8h, v1.8b, v25.8b \n" // G |
2160 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2170 "umlal v3.8h, v2.8b, v26.8b \n" // R |
2161 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2171 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2162 "uqadd v0.8b, v0.8b, v27.8b \n" | 2172 "uqadd v0.8b, v0.8b, v27.8b \n" |
2163 MEMACCESS(1) | 2173 MEMACCESS(1) |
2164 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2174 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2165 "b.gt 1b \n" | 2175 "b.gt 1b \n" |
2166 : "+r"(src_rgb565), // %0 | 2176 : "+r"(src_rgb565), // %0 |
2167 "+r"(dst_y), // %1 | 2177 "+r"(dst_y), // %1 |
2168 "+r"(pix) // %2 | 2178 "+r"(pix) // %2 |
2169 : | 2179 : |
2170 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", | 2180 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", |
2171 "v24", "v25", "v26", "v27" | 2181 "v24", "v25", "v26", "v27" |
2172 ); | 2182 ); |
2173 } | 2183 } |
2174 #endif // HAS_RGB565TOYROW_NEON | 2184 #endif // HAS_RGB565TOYROW_NEON |
2175 | 2185 |
2176 #ifdef HAS_ARGB1555TOYROW_NEON | 2186 #ifdef HAS_ARGB1555TOYROW_NEON |
// ARGB1555ToYRow_NEON: converts a row of ARGB1555 pixels to 8-bit luma (Y).
//   src_argb1555: source pixels; 16 bytes (8 pixels) are consumed per iteration.
//   dst_y:        destination luma; 8 bytes are written per iteration.
//   pix:          pixel count, decremented by 8 per iteration; the loop
//                 repeats while the result is > 0.
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16 (saturating).
// ARGB1555TOARGB is a macro defined elsewhere in the file; judging by the
// multiply comments below it leaves B, G, R in v0, v1, v2 -- TODO confirm.
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2177 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { | 2187 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { |
2178 asm volatile ( | 2188 asm volatile ( |
2179 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2189 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2180 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2190 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2181 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2191 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2182 "movi v7.8b, #16 \n" // Add 16 constant | 2192 "movi v7.8b, #16 \n" // Add 16 constant |
2183 "1: \n" | 2193 "1: \n" |
2184 MEMACCESS(0) | 2194 MEMACCESS(0) |
2185 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2195 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
2186 "subs %2, %2, #8 \n" // 8 processed per loop. | 2196 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2187 ARGB1555TOARGB | 2197 ARGB1555TOARGB |
2188 "umull v3.8h, v0.8b, v4.8b \n" // B | 2198 "umull v3.8h, v0.8b, v4.8b \n" // B |
2189 "umlal v3.8h, v1.8b, v5.8b \n" // G | 2199 "umlal v3.8h, v1.8b, v5.8b \n" // G |
2190 "umlal v3.8h, v2.8b, v6.8b \n" // R | 2200 "umlal v3.8h, v2.8b, v6.8b \n" // R |
2191 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2201 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2192 "uqadd v0.8b, v0.8b, v7.8b \n" | 2202 "uqadd v0.8b, v0.8b, v7.8b \n" |
2193 MEMACCESS(1) | 2203 MEMACCESS(1) |
2194 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2204 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2195 "b.gt 1b \n" | 2205 "b.gt 1b \n" |
2196 : "+r"(src_argb1555), // %0 | 2206 : "+r"(src_argb1555), // %0 |
2197 "+r"(dst_y), // %1 | 2207 "+r"(dst_y), // %1 |
2198 "+r"(pix) // %2 | 2208 "+r"(pix) // %2 |
2199 : | 2209 : |
2200 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2210 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2201 ); | 2211 ); |
2202 } | 2212 } |
2203 #endif // HAS_ARGB1555TOYROW_NEON | 2213 #endif // HAS_ARGB1555TOYROW_NEON |
2204 | 2214 |
2205 #ifdef HAS_ARGB4444TOYROW_NEON | 2215 #ifdef HAS_ARGB4444TOYROW_NEON |
// ARGB4444ToYRow_NEON: converts a row of ARGB4444 pixels to 8-bit luma (Y).
//   src_argb4444: source pixels; 16 bytes (8 pixels) are consumed per iteration.
//   dst_y:        destination luma; 8 bytes are written per iteration.
//   pix:          pixel count, decremented by 8 per iteration; the loop
//                 repeats while the result is > 0.
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16 (saturating).
// ARGB4444TOARGB is a macro defined elsewhere in the file; judging by the
// multiply comments below it leaves B, G, R in v0, v1, v2 -- TODO confirm.
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2206 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { | 2216 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { |
2207 asm volatile ( | 2217 asm volatile ( |
2208 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2218 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
2209 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2219 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
2210 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2220 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
2211 "movi v27.8b, #16 \n" // Add 16 constant | 2221 "movi v27.8b, #16 \n" // Add 16 constant |
2212 "1: \n" | 2222 "1: \n" |
2213 MEMACCESS(0) | 2223 MEMACCESS(0) |
2214 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2224 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
2215 "subs %2, %2, #8 \n" // 8 processed per loop. | 2225 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2216 ARGB4444TOARGB | 2226 ARGB4444TOARGB |
2217 "umull v3.8h, v0.8b, v24.8b \n" // B | 2227 "umull v3.8h, v0.8b, v24.8b \n" // B |
2218 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2228 "umlal v3.8h, v1.8b, v25.8b \n" // G |
2219 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2229 "umlal v3.8h, v2.8b, v26.8b \n" // R |
2220 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2230 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
2221 "uqadd v0.8b, v0.8b, v27.8b \n" | 2231 "uqadd v0.8b, v0.8b, v27.8b \n" |
2222 MEMACCESS(1) | 2232 MEMACCESS(1) |
2223 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2233 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2224 "b.gt 1b \n" | 2234 "b.gt 1b \n" |
2225 : "+r"(src_argb4444), // %0 | 2235 : "+r"(src_argb4444), // %0 |
2226 "+r"(dst_y), // %1 | 2236 "+r"(dst_y), // %1 |
2227 "+r"(pix) // %2 | 2237 "+r"(pix) // %2 |
2228 : | 2238 : |
2229 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" | 2239 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" |
2230 ); | 2240 ); |
2231 } | 2241 } |
2232 #endif // HAS_ARGB4444TOYROW_NEON | 2242 #endif // HAS_ARGB4444TOYROW_NEON |
2233 | 2243 |
2234 #ifdef HAS_BGRATOYROW_NEON | 2244 #ifdef HAS_BGRATOYROW_NEON |
// BGRAToYRow_NEON: converts a row of 4-byte BGRA pixels to 8-bit luma (Y).
//   src_bgra: source pixels; 32 bytes (8 pixels) are consumed per iteration.
//   dst_y:    destination luma; 8 bytes are written per iteration.
//   pix:      pixel count, decremented by 8 per iteration; the loop repeats
//             while the result is > 0.
// ld4 de-interleaves the four bytes of each pixel into v0..v3. Only v1 (R),
// v2 (G) and v3 (B) enter the weighted sum; v0 is loaded but not used
// (presumably the alpha byte).
// Per pixel: Y = sat_u8(round((33*R + 65*G + 13*B) / 128)) + 16 (saturating).
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2235 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { | 2245 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { |
2236 asm volatile ( | 2246 asm volatile ( |
2237 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2247 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2238 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2248 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2239 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2249 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2240 "movi v7.8b, #16 \n" // Add 16 constant | 2250 "movi v7.8b, #16 \n" // Add 16 constant |
2241 "1: \n" | 2251 "1: \n" |
2242 MEMACCESS(0) | 2252 MEMACCESS(0) |
2243 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2253 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2244 "subs %2, %2, #8 \n" // 8 processed per loop. | 2254 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2245 "umull v16.8h, v1.8b, v4.8b \n" // R | 2255 "umull v16.8h, v1.8b, v4.8b \n" // R |
2246 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2256 "umlal v16.8h, v2.8b, v5.8b \n" // G |
2247 "umlal v16.8h, v3.8b, v6.8b \n" // B | 2257 "umlal v16.8h, v3.8b, v6.8b \n" // B |
2248 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2258 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2249 "uqadd v0.8b, v0.8b, v7.8b \n" | 2259 "uqadd v0.8b, v0.8b, v7.8b \n" |
2250 MEMACCESS(1) | 2260 MEMACCESS(1) |
2251 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2261 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2252 "b.gt 1b \n" | 2262 "b.gt 1b \n" |
2253 : "+r"(src_bgra), // %0 | 2263 : "+r"(src_bgra), // %0 |
2254 "+r"(dst_y), // %1 | 2264 "+r"(dst_y), // %1 |
2255 "+r"(pix) // %2 | 2265 "+r"(pix) // %2 |
2256 : | 2266 : |
2257 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2267 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2258 ); | 2268 ); |
2259 } | 2269 } |
2260 #endif // HAS_BGRATOYROW_NEON | 2270 #endif // HAS_BGRATOYROW_NEON |
2261 | 2271 |
2262 #ifdef HAS_ABGRTOYROW_NEON | 2272 #ifdef HAS_ABGRTOYROW_NEON |
// ABGRToYRow_NEON: converts a row of 4-byte ABGR pixels to 8-bit luma (Y).
//   src_abgr: source pixels; 32 bytes (8 pixels) are consumed per iteration.
//   dst_y:    destination luma; 8 bytes are written per iteration.
//   pix:      pixel count, decremented by 8 per iteration; the loop repeats
//             while the result is > 0.
// ld4 de-interleaves the four bytes of each pixel into v0..v3. Only v0 (R),
// v1 (G) and v2 (B) enter the weighted sum; v3 is loaded but not used
// (presumably the alpha byte).
// Per pixel: Y = sat_u8(round((33*R + 65*G + 13*B) / 128)) + 16 (saturating).
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2263 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { | 2273 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { |
2264 asm volatile ( | 2274 asm volatile ( |
2265 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2275 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2266 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2276 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2267 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2277 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2268 "movi v7.8b, #16 \n" // Add 16 constant | 2278 "movi v7.8b, #16 \n" // Add 16 constant |
2269 "1: \n" | 2279 "1: \n" |
2270 MEMACCESS(0) | 2280 MEMACCESS(0) |
2271 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2281 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2272 "subs %2, %2, #8 \n" // 8 processed per loop. | 2282 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2273 "umull v16.8h, v0.8b, v4.8b \n" // R | 2283 "umull v16.8h, v0.8b, v4.8b \n" // R |
2274 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2284 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2275 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2285 "umlal v16.8h, v2.8b, v6.8b \n" // B |
2276 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2286 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2277 "uqadd v0.8b, v0.8b, v7.8b \n" | 2287 "uqadd v0.8b, v0.8b, v7.8b \n" |
2278 MEMACCESS(1) | 2288 MEMACCESS(1) |
2279 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2289 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2280 "b.gt 1b \n" | 2290 "b.gt 1b \n" |
2281 : "+r"(src_abgr), // %0 | 2291 : "+r"(src_abgr), // %0 |
2282 "+r"(dst_y), // %1 | 2292 "+r"(dst_y), // %1 |
2283 "+r"(pix) // %2 | 2293 "+r"(pix) // %2 |
2284 : | 2294 : |
2285 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2295 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2286 ); | 2296 ); |
2287 } | 2297 } |
2288 #endif // HAS_ABGRTOYROW_NEON | 2298 #endif // HAS_ABGRTOYROW_NEON |
2289 | 2299 |
2290 #ifdef HAS_RGBATOYROW_NEON | 2300 #ifdef HAS_RGBATOYROW_NEON |
// RGBAToYRow_NEON: converts a row of 4-byte RGBA pixels to 8-bit luma (Y).
//   src_rgba: source pixels; 32 bytes (8 pixels) are consumed per iteration.
//   dst_y:    destination luma; 8 bytes are written per iteration.
//   pix:      pixel count, decremented by 8 per iteration; the loop repeats
//             while the result is > 0.
// ld4 de-interleaves the four bytes of each pixel into v0..v3. Only v1 (B),
// v2 (G) and v3 (R) enter the weighted sum; v0 is loaded but not used
// (presumably the alpha byte).
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16 (saturating).
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2291 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { | 2301 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { |
2292 asm volatile ( | 2302 asm volatile ( |
2293 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2303 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2294 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2304 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2295 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2305 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2296 "movi v7.8b, #16 \n" // Add 16 constant | 2306 "movi v7.8b, #16 \n" // Add 16 constant |
2297 "1: \n" | 2307 "1: \n" |
2298 MEMACCESS(0) | 2308 MEMACCESS(0) |
2299 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2309 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
2300 "subs %2, %2, #8 \n" // 8 processed per loop. | 2310 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2301 "umull v16.8h, v1.8b, v4.8b \n" // B | 2311 "umull v16.8h, v1.8b, v4.8b \n" // B |
2302 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2312 "umlal v16.8h, v2.8b, v5.8b \n" // G |
2303 "umlal v16.8h, v3.8b, v6.8b \n" // R | 2313 "umlal v16.8h, v3.8b, v6.8b \n" // R |
2304 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2314 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2305 "uqadd v0.8b, v0.8b, v7.8b \n" | 2315 "uqadd v0.8b, v0.8b, v7.8b \n" |
2306 MEMACCESS(1) | 2316 MEMACCESS(1) |
2307 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2317 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2308 "b.gt 1b \n" | 2318 "b.gt 1b \n" |
2309 : "+r"(src_rgba), // %0 | 2319 : "+r"(src_rgba), // %0 |
2310 "+r"(dst_y), // %1 | 2320 "+r"(dst_y), // %1 |
2311 "+r"(pix) // %2 | 2321 "+r"(pix) // %2 |
2312 : | 2322 : |
2313 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2323 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2314 ); | 2324 ); |
2315 } | 2325 } |
2316 #endif // HAS_RGBATOYROW_NEON | 2326 #endif // HAS_RGBATOYROW_NEON |
2317 | 2327 |
2318 #ifdef HAS_RGB24TOYROW_NEON | 2328 #ifdef HAS_RGB24TOYROW_NEON |
// RGB24ToYRow_NEON: converts a row of 3-byte RGB24 pixels to 8-bit luma (Y).
//   src_rgb24: source pixels; 24 bytes (8 pixels) are consumed per iteration.
//   dst_y:     destination luma; 8 bytes are written per iteration.
//   pix:       pixel count, decremented by 8 per iteration; the loop repeats
//              while the result is > 0.
// ld3 de-interleaves the three bytes of each pixel into v0 (B), v1 (G), v2 (R).
// Per pixel: Y = sat_u8(round((13*B + 65*G + 33*R) / 128)) + 16 (saturating).
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2319 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { | 2329 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { |
2320 asm volatile ( | 2330 asm volatile ( |
2321 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2331 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
2322 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2332 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2323 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2333 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
2324 "movi v7.8b, #16 \n" // Add 16 constant | 2334 "movi v7.8b, #16 \n" // Add 16 constant |
2325 "1: \n" | 2335 "1: \n" |
2326 MEMACCESS(0) | 2336 MEMACCESS(0) |
2327 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2337 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
2328 "subs %2, %2, #8 \n" // 8 processed per loop. | 2338 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2329 "umull v16.8h, v0.8b, v4.8b \n" // B | 2339 "umull v16.8h, v0.8b, v4.8b \n" // B |
2330 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2340 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2331 "umlal v16.8h, v2.8b, v6.8b \n" // R | 2341 "umlal v16.8h, v2.8b, v6.8b \n" // R |
2332 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2342 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2333 "uqadd v0.8b, v0.8b, v7.8b \n" | 2343 "uqadd v0.8b, v0.8b, v7.8b \n" |
2334 MEMACCESS(1) | 2344 MEMACCESS(1) |
2335 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2345 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2336 "b.gt 1b \n" | 2346 "b.gt 1b \n" |
2337 : "+r"(src_rgb24), // %0 | 2347 : "+r"(src_rgb24), // %0 |
2338 "+r"(dst_y), // %1 | 2348 "+r"(dst_y), // %1 |
2339 "+r"(pix) // %2 | 2349 "+r"(pix) // %2 |
2340 : | 2350 : |
2341 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2342 ); | 2352 ); |
2343 } | 2353 } |
2344 #endif // HAS_RGB24TOYROW_NEON | 2354 #endif // HAS_RGB24TOYROW_NEON |
2345 | 2355 |
2346 #ifdef HAS_RAWTOYROW_NEON | 2356 #ifdef HAS_RAWTOYROW_NEON |
// RAWToYRow_NEON: converts a row of 3-byte RAW pixels to 8-bit luma (Y).
//   src_raw: source pixels; 24 bytes (8 pixels) are consumed per iteration.
//   dst_y:   destination luma; 8 bytes are written per iteration.
//   pix:     pixel count, decremented by 8 per iteration; the loop repeats
//            while the result is > 0.
// ld3 de-interleaves the three bytes of each pixel into v0, v1, v2. v0 is
// weighted by the R coefficient (33), v1 by G (65) and v2 by B (13), i.e. the
// channel order is the reverse of RGB24ToYRow_NEON.
// Per pixel: Y = sat_u8(round((33*R + 65*G + 13*B) / 128)) + 16 (saturating).
// Diff note: the NEW column uses %w2, the 32-bit W-register form of the int
// counter operand; the OLD column uses the default register form (%2).
2347 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { | 2357 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { |
2348 asm volatile ( | 2358 asm volatile ( |
2349 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2359 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
2350 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2360 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
2351 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2361 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
2352 "movi v7.8b, #16 \n" // Add 16 constant | 2362 "movi v7.8b, #16 \n" // Add 16 constant |
2353 "1: \n" | 2363 "1: \n" |
2354 MEMACCESS(0) | 2364 MEMACCESS(0) |
2355 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2365 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
2356 "subs %2, %2, #8 \n" // 8 processed per loop. | 2366 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2357 "umull v16.8h, v0.8b, v4.8b \n" // R | 2367 "umull v16.8h, v0.8b, v4.8b \n" // R |
2358 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2368 "umlal v16.8h, v1.8b, v5.8b \n" // G |
2359 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2369 "umlal v16.8h, v2.8b, v6.8b \n" // B |
2360 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2370 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
2361 "uqadd v0.8b, v0.8b, v7.8b \n" | 2371 "uqadd v0.8b, v0.8b, v7.8b \n" |
2362 MEMACCESS(1) | 2372 MEMACCESS(1) |
2363 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2373 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
2364 "b.gt 1b \n" | 2374 "b.gt 1b \n" |
2365 : "+r"(src_raw), // %0 | 2375 : "+r"(src_raw), // %0 |
2366 "+r"(dst_y), // %1 | 2376 "+r"(dst_y), // %1 |
2367 "+r"(pix) // %2 | 2377 "+r"(pix) // %2 |
2368 : | 2378 : |
2369 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2379 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
2370 ); | 2380 ); |
2371 } | 2381 } |
2372 #endif // HAS_RAWTOYROW_NEON | 2382 #endif // HAS_RAWTOYROW_NEON |
2373 | 2383 |
2374 // Bilinear filter 16x2 -> 16x1 | 2384 // Bilinear filter 16x2 -> 16x1 |
2375 #ifdef HAS_INTERPOLATEROW_NEON | 2385 #ifdef HAS_INTERPOLATEROW_NEON |
// InterpolateRow_NEON: blends two source rows into one destination row.
//   dst_ptr:           destination row; 16 bytes are written per iteration.
//   src_ptr:           first source row; the second row is src_ptr + src_stride.
//   src_stride:        byte offset from the first source row to the second.
//   dst_width:         byte count, decremented by 16 per iteration; each loop
//                      repeats while the result is > 0.
//   source_y_fraction: weight of the second row out of 256 (0 = first row only).
// Fractions 0, 64, 128 and 192 branch to dedicated loops (copy, or one/two
// rounding halving adds). Every other fraction uses the general path:
//   dst = round((row0 * (256 - f) + row1 * f) / 256)
// The weights are broadcast as bytes, so a weight of 256 cannot be
// represented; fraction 0 therefore has to take the copy path.
// The clobber list names v2 because the general path writes it.
// Diff note: the NEW column uses %w3 / %w4, the 32-bit W-register form of the
// int operands; the OLD column uses the default register form.
2376 void InterpolateRow_NEON(uint8* dst_ptr, | 2386 void InterpolateRow_NEON(uint8* dst_ptr, |
2377 const uint8* src_ptr, ptrdiff_t src_stride, | 2387 const uint8* src_ptr, ptrdiff_t src_stride, |
2378 int dst_width, int source_y_fraction) { | 2388 int dst_width, int source_y_fraction) { |
2379 int y1_fraction = source_y_fraction; | 2389 int y1_fraction = source_y_fraction; |
2380 int y0_fraction = 256 - y1_fraction; | 2390 int y0_fraction = 256 - y1_fraction; |
2381 const uint8* src_ptr1 = src_ptr + src_stride; | 2391 const uint8* src_ptr1 = src_ptr + src_stride; |
2382 asm volatile ( | 2392 asm volatile ( |
// Dispatch on the fraction of the second row (y1_fraction).
2383 "cmp %4, #0 \n" | 2393 "cmp %w4, #0 \n" |
2384 "b.eq 100f \n" | 2394 "b.eq 100f \n" |
2385 "cmp %4, #64 \n" | 2395 "cmp %w4, #64 \n" |
2386 "b.eq 75f \n" | 2396 "b.eq 75f \n" |
2387 "cmp %4, #128 \n" | 2397 "cmp %w4, #128 \n" |
2388 "b.eq 50f \n" | 2398 "b.eq 50f \n" |
2389 "cmp %4, #192 \n" | 2399 "cmp %w4, #192 \n" |
2390 "b.eq 25f \n" | 2400 "b.eq 25f \n" |
2391 | 2401 |
// v5 = weight of the second row, v4 = weight of the first row (as bytes).
2392 "dup v5.16b, %w4 \n" | 2402 "dup v5.16b, %w4 \n" |
2393 "dup v4.16b, %w5 \n" | 2403 "dup v4.16b, %w5 \n" |
2394 // General purpose row blend. | 2404 // General purpose row blend. |
2395 "1: \n" | 2405 "1: \n" |
2396 MEMACCESS(1) | 2406 MEMACCESS(1) |
2397 "ld1 {v0.16b}, [%1], #16 \n" | 2407 "ld1 {v0.16b}, [%1], #16 \n" |
2398 MEMACCESS(2) | 2408 MEMACCESS(2) |
2399 "ld1 {v1.16b}, [%2], #16 \n" | 2409 "ld1 {v1.16b}, [%2], #16 \n" |
2400 "subs %3, %3, #16 \n" | 2410 "subs %w3, %w3, #16 \n" |
// v2 / v3 hold the 16-bit weighted sums of the low / high 8 bytes.
2401 "umull v2.8h, v0.8b, v4.8b \n" | 2411 "umull v2.8h, v0.8b, v4.8b \n" |
2402 "umull2 v3.8h, v0.16b, v4.16b \n" | 2412 "umull2 v3.8h, v0.16b, v4.16b \n" |
2403 "umlal v2.8h, v1.8b, v5.8b \n" | 2413 "umlal v2.8h, v1.8b, v5.8b \n" |
2404 "umlal2 v3.8h, v1.16b, v5.16b \n" | 2414 "umlal2 v3.8h, v1.16b, v5.16b \n" |
// Rounding shift by 8 narrows the sums back to bytes.
2405 "rshrn v0.8b, v2.8h, #8 \n" | 2415 "rshrn v0.8b, v2.8h, #8 \n" |
2406 "rshrn2 v0.16b, v3.8h, #8 \n" | 2416 "rshrn2 v0.16b, v3.8h, #8 \n" |
2407 MEMACCESS(0) | 2417 MEMACCESS(0) |
2408 "st1 {v0.16b}, [%0], #16 \n" | 2418 "st1 {v0.16b}, [%0], #16 \n" |
2409 "b.gt 1b \n" | 2419 "b.gt 1b \n" |
2410 "b 99f \n" | 2420 "b 99f \n" |
2411 | 2421 |
2412 // Blend 25 / 75. | 2422 // Blend 25 / 75. |
2413 "25: \n" | 2423 "25: \n" |
2414 MEMACCESS(1) | 2424 MEMACCESS(1) |
2415 "ld1 {v0.16b}, [%1], #16 \n" | 2425 "ld1 {v0.16b}, [%1], #16 \n" |
2416 MEMACCESS(2) | 2426 MEMACCESS(2) |
2417 "ld1 {v1.16b}, [%2], #16 \n" | 2427 "ld1 {v1.16b}, [%2], #16 \n" |
2418 "subs %3, %3, #16 \n" | 2428 "subs %w3, %w3, #16 \n" |
// Two rounding halving adds against the second row weight it at about 75%.
2419 "urhadd v0.16b, v0.16b, v1.16b \n" | 2429 "urhadd v0.16b, v0.16b, v1.16b \n" |
2420 "urhadd v0.16b, v0.16b, v1.16b \n" | 2430 "urhadd v0.16b, v0.16b, v1.16b \n" |
2421 MEMACCESS(0) | 2431 MEMACCESS(0) |
2422 "st1 {v0.16b}, [%0], #16 \n" | 2432 "st1 {v0.16b}, [%0], #16 \n" |
2423 "b.gt 25b \n" | 2433 "b.gt 25b \n" |
2424 "b 99f \n" | 2434 "b 99f \n" |
2425 | 2435 |
2426 // Blend 50 / 50. | 2436 // Blend 50 / 50. |
2427 "50: \n" | 2437 "50: \n" |
2428 MEMACCESS(1) | 2438 MEMACCESS(1) |
2429 "ld1 {v0.16b}, [%1], #16 \n" | 2439 "ld1 {v0.16b}, [%1], #16 \n" |
2430 MEMACCESS(2) | 2440 MEMACCESS(2) |
2431 "ld1 {v1.16b}, [%2], #16 \n" | 2441 "ld1 {v1.16b}, [%2], #16 \n" |
2432 "subs %3, %3, #16 \n" | 2442 "subs %w3, %w3, #16 \n" |
2433 "urhadd v0.16b, v0.16b, v1.16b \n" | 2443 "urhadd v0.16b, v0.16b, v1.16b \n" |
2434 MEMACCESS(0) | 2444 MEMACCESS(0) |
2435 "st1 {v0.16b}, [%0], #16 \n" | 2445 "st1 {v0.16b}, [%0], #16 \n" |
2436 "b.gt 50b \n" | 2446 "b.gt 50b \n" |
2437 "b 99f \n" | 2447 "b 99f \n" |
2438 | 2448 |
2439 // Blend 75 / 25. | 2449 // Blend 75 / 25. |
2440 "75: \n" | 2450 "75: \n" |
// Registers are swapped relative to the 25 / 75 loop: v1 = first row, v0 = second row.
2441 MEMACCESS(1) | 2451 MEMACCESS(1) |
2442 "ld1 {v1.16b}, [%1], #16 \n" | 2452 "ld1 {v1.16b}, [%1], #16 \n" |
2443 MEMACCESS(2) | 2453 MEMACCESS(2) |
2444 "ld1 {v0.16b}, [%2], #16 \n" | 2454 "ld1 {v0.16b}, [%2], #16 \n" |
2445 "subs %3, %3, #16 \n" | 2455 "subs %w3, %w3, #16 \n" |
2446 "urhadd v0.16b, v0.16b, v1.16b \n" | 2456 "urhadd v0.16b, v0.16b, v1.16b \n" |
2447 "urhadd v0.16b, v0.16b, v1.16b \n" | 2457 "urhadd v0.16b, v0.16b, v1.16b \n" |
2448 MEMACCESS(0) | 2458 MEMACCESS(0) |
2449 "st1 {v0.16b}, [%0], #16 \n" | 2459 "st1 {v0.16b}, [%0], #16 \n" |
2450 "b.gt 75b \n" | 2460 "b.gt 75b \n" |
2451 "b 99f \n" | 2461 "b 99f \n" |
2452 | 2462 |
2453 // Blend 100 / 0 - Copy row unchanged. | 2463 // Blend 100 / 0 - Copy row unchanged. |
2454 "100: \n" | 2464 "100: \n" |
2455 MEMACCESS(1) | 2465 MEMACCESS(1) |
2456 "ld1 {v0.16b}, [%1], #16 \n" | 2466 "ld1 {v0.16b}, [%1], #16 \n" |
2457 "subs %3, %3, #16 \n" | 2467 "subs %w3, %w3, #16 \n" |
2458 MEMACCESS(0) | 2468 MEMACCESS(0) |
2459 "st1 {v0.16b}, [%0], #16 \n" | 2469 "st1 {v0.16b}, [%0], #16 \n" |
2460 "b.gt 100b \n" | 2470 "b.gt 100b \n" |
2461 | 2471 |
2462 "99: \n" | 2472 "99: \n" |
2463 : "+r"(dst_ptr), // %0 | 2473 : "+r"(dst_ptr), // %0 |
2464 "+r"(src_ptr), // %1 | 2474 "+r"(src_ptr), // %1 |
2465 "+r"(src_ptr1), // %2 | 2475 "+r"(src_ptr1), // %2 |
2466 "+r"(dst_width), // %3 | 2476 "+r"(dst_width), // %3 |
2467 "+r"(y1_fraction), // %4 | 2477 "+r"(y1_fraction), // %4 |
2468 "+r"(y0_fraction) // %5 | 2478 "+r"(y0_fraction) // %5 |
2469 : | 2479 : |
2470 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" | 2480 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" |
2471 ); | 2481 ); |
2472 } | 2482 } |
2473 #endif // HAS_INTERPOLATEROW_NEON | 2483 #endif // HAS_INTERPOLATEROW_NEON |
2474 | 2484 |
2475 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr | 2485 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr |
2476 #ifdef HAS_ARGBBLENDROW_NEON | 2486 #ifdef HAS_ARGBBLENDROW_NEON |
2477 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2487 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2478 uint8* dst_argb, int width) { | 2488 uint8* dst_argb, int width) { |
2479 asm volatile ( | 2489 asm volatile ( |
2480 "subs %3, %3, #8 \n" | 2490 "subs %w3, %w3, #8 \n" |
2481 "b.lt 89f \n" | 2491 "b.lt 89f \n" |
2482 // Blend 8 pixels. | 2492 // Blend 8 pixels. |
2483 "8: \n" | 2493 "8: \n" |
2484 MEMACCESS(0) | 2494 MEMACCESS(0) |
2485 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels | 2495 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels |
2486 MEMACCESS(1) | 2496 MEMACCESS(1) |
2487 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels | 2497 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels |
2488 "subs %3, %3, #8 \n" // 8 processed per loop. | 2498 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2489 "umull v16.8h, v4.8b, v3.8b \n" // db * a | 2499 "umull v16.8h, v4.8b, v3.8b \n" // db * a |
2490 "umull v17.8h, v5.8b, v3.8b \n" // dg * a | 2500 "umull v17.8h, v5.8b, v3.8b \n" // dg * a |
2491 "umull v18.8h, v6.8b, v3.8b \n" // dr * a | 2501 "umull v18.8h, v6.8b, v3.8b \n" // dr * a |
2492 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 | 2502 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 |
2493 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 | 2503 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 |
2494 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 | 2504 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 |
2495 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) | 2505 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) |
2496 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) | 2506 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) |
2497 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) | 2507 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) |
2498 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb | 2508 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb |
2499 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg | 2509 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg |
2500 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr | 2510 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr |
2501 "movi v3.8b, #255 \n" // a = 255 | 2511 "movi v3.8b, #255 \n" // a = 255 |
2502 MEMACCESS(2) | 2512 MEMACCESS(2) |
2503 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2513 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2504 "b.ge 8b \n" | 2514 "b.ge 8b \n" |
2505 | 2515 |
2506 "89: \n" | 2516 "89: \n" |
2507 "adds %3, %3, #8-1 \n" | 2517 "adds %w3, %w3, #8-1 \n" |
2508 "b.lt 99f \n" | 2518 "b.lt 99f \n" |
2509 | 2519 |
2510 // Blend 1 pixels. | 2520 // Blend 1 pixels. |
2511 "1: \n" | 2521 "1: \n" |
2512 MEMACCESS(0) | 2522 MEMACCESS(0) |
2513 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. | 2523 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. |
2514 MEMACCESS(1) | 2524 MEMACCESS(1) |
2515 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. | 2525 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. |
2516 "subs %3, %3, #1 \n" // 1 processed per loop. | 2526 "subs %w3, %w3, #1 \n" // 1 processed per loop. |
2517 "umull v16.8h, v4.8b, v3.8b \n" // db * a | 2527 "umull v16.8h, v4.8b, v3.8b \n" // db * a |
2518 "umull v17.8h, v5.8b, v3.8b \n" // dg * a | 2528 "umull v17.8h, v5.8b, v3.8b \n" // dg * a |
2519 "umull v18.8h, v6.8b, v3.8b \n" // dr * a | 2529 "umull v18.8h, v6.8b, v3.8b \n" // dr * a |
2520 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 | 2530 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 |
2521 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 | 2531 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 |
2522 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 | 2532 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 |
2523 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) | 2533 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) |
2524 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) | 2534 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) |
2525 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) | 2535 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) |
2526 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb | 2536 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb |
(...skipping 18 matching lines...) Expand all Loading... |
2545 #endif // HAS_ARGBBLENDROW_NEON | 2555 #endif // HAS_ARGBBLENDROW_NEON |
2546 | 2556 |
2547 // Attenuate 8 pixels at a time. | 2557 // Attenuate 8 pixels at a time. |
2548 #ifdef HAS_ARGBATTENUATEROW_NEON | 2558 #ifdef HAS_ARGBATTENUATEROW_NEON |
2549 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 2559 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
2550 asm volatile ( | 2560 asm volatile ( |
2551 // Attenuate 8 pixels. | 2561 // Attenuate 8 pixels. |
2552 "1: \n" | 2562 "1: \n" |
2553 MEMACCESS(0) | 2563 MEMACCESS(0) |
2554 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels | 2564 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels |
2555 "subs %2, %2, #8 \n" // 8 processed per loop. | 2565 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2556 "umull v4.8h, v0.8b, v3.8b \n" // b * a | 2566 "umull v4.8h, v0.8b, v3.8b \n" // b * a |
2557 "umull v5.8h, v1.8b, v3.8b \n" // g * a | 2567 "umull v5.8h, v1.8b, v3.8b \n" // g * a |
2558 "umull v6.8h, v2.8b, v3.8b \n" // r * a | 2568 "umull v6.8h, v2.8b, v3.8b \n" // r * a |
2559 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 | 2569 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 |
2560 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 | 2570 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 |
2561 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 | 2571 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 |
2562 MEMACCESS(1) | 2572 MEMACCESS(1) |
2563 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 2573 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
2564 "b.gt 1b \n" | 2574 "b.gt 1b \n" |
2565 : "+r"(src_argb), // %0 | 2575 : "+r"(src_argb), // %0 |
(...skipping 13 matching lines...) Expand all Loading... |
2579 asm volatile ( | 2589 asm volatile ( |
2580 "dup v4.8h, %w2 \n" | 2590 "dup v4.8h, %w2 \n" |
2581 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 | 2591 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 |
2582 "dup v5.8h, %w3 \n" // interval multiply. | 2592 "dup v5.8h, %w3 \n" // interval multiply. |
2583 "dup v6.8h, %w4 \n" // interval add | 2593 "dup v6.8h, %w4 \n" // interval add |
2584 | 2594 |
2585 // 8 pixel loop. | 2595 // 8 pixel loop. |
2586 "1: \n" | 2596 "1: \n" |
2587 MEMACCESS(0) | 2597 MEMACCESS(0) |
2588 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. | 2598 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. |
2589 "subs %1, %1, #8 \n" // 8 processed per loop. | 2599 "subs %w1, %w1, #8 \n" // 8 processed per loop. |
2590 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) | 2600 "uxtl v0.8h, v0.8b \n" // b (0 .. 255) |
2591 "uxtl v1.8h, v1.8b \n" | 2601 "uxtl v1.8h, v1.8b \n" |
2592 "uxtl v2.8h, v2.8b \n" | 2602 "uxtl v2.8h, v2.8b \n" |
2593 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale | 2603 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale |
2594 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g | 2604 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g |
2595 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r | 2605 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r |
2596 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size | 2606 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size |
2597 "mul v1.8h, v1.8h, v5.8h \n" // g | 2607 "mul v1.8h, v1.8h, v5.8h \n" // g |
2598 "mul v2.8h, v2.8h, v5.8h \n" // r | 2608 "mul v2.8h, v2.8h, v5.8h \n" // r |
2599 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset | 2609 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset |
(...skipping 23 matching lines...) Expand all Loading... |
2623 uint32 value) { | 2633 uint32 value) { |
2624 asm volatile ( | 2634 asm volatile ( |
2625 "dup v0.4s, %w3 \n" // duplicate scale value. | 2635 "dup v0.4s, %w3 \n" // duplicate scale value. |
2626 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. | 2636 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. |
2627 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. | 2637 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. |
2628 | 2638 |
2629 // 8 pixel loop. | 2639 // 8 pixel loop. |
2630 "1: \n" | 2640 "1: \n" |
2631 MEMACCESS(0) | 2641 MEMACCESS(0) |
2632 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2642 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2633 "subs %2, %2, #8 \n" // 8 processed per loop. | 2643 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2634 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) | 2644 "uxtl v4.8h, v4.8b \n" // b (0 .. 255) |
2635 "uxtl v5.8h, v5.8b \n" | 2645 "uxtl v5.8h, v5.8b \n" |
2636 "uxtl v6.8h, v6.8b \n" | 2646 "uxtl v6.8h, v6.8b \n" |
2637 "uxtl v7.8h, v7.8b \n" | 2647 "uxtl v7.8h, v7.8b \n" |
2638 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 | 2648 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 |
2639 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g | 2649 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g |
2640 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r | 2650 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r |
2641 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a | 2651 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a |
2642 "uqxtn v4.8b, v4.8h \n" | 2652 "uqxtn v4.8b, v4.8h \n" |
2643 "uqxtn v5.8b, v5.8h \n" | 2653 "uqxtn v5.8b, v5.8h \n" |
(...skipping 16 matching lines...) Expand all Loading... |
2660 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; | 2670 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; |
2661 #ifdef HAS_ARGBGRAYROW_NEON | 2671 #ifdef HAS_ARGBGRAYROW_NEON |
2662 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 2672 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
2663 asm volatile ( | 2673 asm volatile ( |
2664 "movi v24.8b, #15 \n" // B * 0.11400 coefficient | 2674 "movi v24.8b, #15 \n" // B * 0.11400 coefficient |
2665 "movi v25.8b, #75 \n" // G * 0.58700 coefficient | 2675 "movi v25.8b, #75 \n" // G * 0.58700 coefficient |
2666 "movi v26.8b, #38 \n" // R * 0.29900 coefficient | 2676 "movi v26.8b, #38 \n" // R * 0.29900 coefficient |
2667 "1: \n" | 2677 "1: \n" |
2668 MEMACCESS(0) | 2678 MEMACCESS(0) |
2669 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2679 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2670 "subs %2, %2, #8 \n" // 8 processed per loop. | 2680 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2671 "umull v4.8h, v0.8b, v24.8b \n" // B | 2681 "umull v4.8h, v0.8b, v24.8b \n" // B |
2672 "umlal v4.8h, v1.8b, v25.8b \n" // G | 2682 "umlal v4.8h, v1.8b, v25.8b \n" // G |
2673 "umlal v4.8h, v2.8b, v26.8b \n" // R | 2683 "umlal v4.8h, v2.8b, v26.8b \n" // R |
2674 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B | 2684 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B |
2675 "orr v1.8b, v0.8b, v0.8b \n" // G | 2685 "orr v1.8b, v0.8b, v0.8b \n" // G |
2676 "orr v2.8b, v0.8b, v0.8b \n" // R | 2686 "orr v2.8b, v0.8b, v0.8b \n" // R |
2677 MEMACCESS(1) | 2687 MEMACCESS(1) |
2678 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. | 2688 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. |
2679 "b.gt 1b \n" | 2689 "b.gt 1b \n" |
2680 : "+r"(src_argb), // %0 | 2690 : "+r"(src_argb), // %0 |
(...skipping 18 matching lines...) Expand all Loading... |
2699 "movi v22.8b, #35 \n" // BR coefficient | 2709 "movi v22.8b, #35 \n" // BR coefficient |
2700 "movi v24.8b, #22 \n" // GB coefficient | 2710 "movi v24.8b, #22 \n" // GB coefficient |
2701 "movi v25.8b, #88 \n" // GG coefficient | 2711 "movi v25.8b, #88 \n" // GG coefficient |
2702 "movi v26.8b, #45 \n" // GR coefficient | 2712 "movi v26.8b, #45 \n" // GR coefficient |
2703 "movi v28.8b, #24 \n" // BB coefficient | 2713 "movi v28.8b, #24 \n" // BB coefficient |
2704 "movi v29.8b, #98 \n" // BG coefficient | 2714 "movi v29.8b, #98 \n" // BG coefficient |
2705 "movi v30.8b, #50 \n" // BR coefficient | 2715 "movi v30.8b, #50 \n" // BR coefficient |
2706 "1: \n" | 2716 "1: \n" |
2707 MEMACCESS(0) | 2717 MEMACCESS(0) |
2708 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. | 2718 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. |
2709 "subs %1, %1, #8 \n" // 8 processed per loop. | 2719 "subs %w1, %w1, #8 \n" // 8 processed per loop. |
2710 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B | 2720 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B |
2711 "umlal v4.8h, v1.8b, v21.8b \n" // G | 2721 "umlal v4.8h, v1.8b, v21.8b \n" // G |
2712 "umlal v4.8h, v2.8b, v22.8b \n" // R | 2722 "umlal v4.8h, v2.8b, v22.8b \n" // R |
2713 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G | 2723 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G |
2714 "umlal v5.8h, v1.8b, v25.8b \n" // G | 2724 "umlal v5.8h, v1.8b, v25.8b \n" // G |
2715 "umlal v5.8h, v2.8b, v26.8b \n" // R | 2725 "umlal v5.8h, v2.8b, v26.8b \n" // R |
2716 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R | 2726 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R |
2717 "umlal v6.8h, v1.8b, v29.8b \n" // G | 2727 "umlal v6.8h, v1.8b, v29.8b \n" // G |
2718 "umlal v6.8h, v2.8b, v30.8b \n" // R | 2728 "umlal v6.8h, v2.8b, v30.8b \n" // R |
2719 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B | 2729 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B |
(...skipping 19 matching lines...) Expand all Loading... |
2739 const int8* matrix_argb, int width) { | 2749 const int8* matrix_argb, int width) { |
2740 asm volatile ( | 2750 asm volatile ( |
2741 MEMACCESS(3) | 2751 MEMACCESS(3) |
2742 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. | 2752 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. |
2743 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. | 2753 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. |
2744 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. | 2754 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. |
2745 | 2755 |
2746 "1: \n" | 2756 "1: \n" |
2747 MEMACCESS(0) | 2757 MEMACCESS(0) |
2748 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. | 2758 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. |
2749 "subs %2, %2, #8 \n" // 8 processed per loop. | 2759 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
2750 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit | 2760 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit |
2751 "uxtl v17.8h, v17.8b \n" // g | 2761 "uxtl v17.8h, v17.8b \n" // g |
2752 "uxtl v18.8h, v18.8b \n" // r | 2762 "uxtl v18.8h, v18.8b \n" // r |
2753 "uxtl v19.8h, v19.8b \n" // a | 2763 "uxtl v19.8h, v19.8b \n" // a |
2754 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B | 2764 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B |
2755 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G | 2765 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G |
2756 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R | 2766 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R |
2757 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A | 2767 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A |
2758 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B | 2768 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B |
2759 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G | 2769 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2801 #ifdef HAS_ARGBMULTIPLYROW_NEON | 2811 #ifdef HAS_ARGBMULTIPLYROW_NEON |
2802 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2812 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2803 uint8* dst_argb, int width) { | 2813 uint8* dst_argb, int width) { |
2804 asm volatile ( | 2814 asm volatile ( |
2805 // 8 pixel loop. | 2815 // 8 pixel loop. |
2806 "1: \n" | 2816 "1: \n" |
2807 MEMACCESS(0) | 2817 MEMACCESS(0) |
2808 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2818 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2809 MEMACCESS(1) | 2819 MEMACCESS(1) |
2810 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2820 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
2811 "subs %3, %3, #8 \n" // 8 processed per loop. | 2821 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2812 "umull v0.8h, v0.8b, v4.8b \n" // multiply B | 2822 "umull v0.8h, v0.8b, v4.8b \n" // multiply B |
2813 "umull v1.8h, v1.8b, v5.8b \n" // multiply G | 2823 "umull v1.8h, v1.8b, v5.8b \n" // multiply G |
2814 "umull v2.8h, v2.8b, v6.8b \n" // multiply R | 2824 "umull v2.8h, v2.8b, v6.8b \n" // multiply R |
2815 "umull v3.8h, v3.8b, v7.8b \n" // multiply A | 2825 "umull v3.8h, v3.8b, v7.8b \n" // multiply A |
2816 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B | 2826 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B |
2817 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G | 2827 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G |
2818 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R | 2828 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R |
2819 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A | 2829 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A |
2820 MEMACCESS(2) | 2830 MEMACCESS(2) |
2821 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2831 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
(...skipping 13 matching lines...) Expand all Loading... |
2835 #ifdef HAS_ARGBADDROW_NEON | 2845 #ifdef HAS_ARGBADDROW_NEON |
// ARGBAddRow_NEON: adds two rows of ARGB pixels channel by channel with
// unsigned saturation and writes the sums to dst_argb.
//   src_argb0, src_argb1 - source rows; 32 bytes (8 pixels) are read from each
//                          per loop pass and both pointers are post-incremented.
//   dst_argb             - destination row; 32 bytes are stored per loop pass.
//   width                - pixel count; decremented by 8 per pass.
// The loop condition is tested at the bottom, so one pass (8 pixels) always
// runs. Presumably callers pass a positive multiple of 8 - TODO confirm.
2836 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2846 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2837 uint8* dst_argb, int width) { | 2847 uint8* dst_argb, int width) { |
2838 asm volatile ( | 2848 asm volatile ( |
2839 // 8 pixel loop. | 2849 // 8 pixel loop. |
2840 "1: \n" | 2850 "1: \n" |
2841 MEMACCESS(0) | 2851 MEMACCESS(0) |
// ld4 de-interleaves the 4-byte pixels: v0-v3 (and v4-v7 below) each receive
// one channel of the 8 pixels.
2842 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2852 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2843 MEMACCESS(1) | 2853 MEMACCESS(1) |
2844 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2854 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
// width is an int: the right-hand column decrements it through the 32-bit
// %w3 view of its register, the left-hand column through the plain %3 form.
// subs also sets the flags consumed by b.gt below.
2845 "subs %3, %3, #8 \n" // 8 processed per loop. | 2855 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
// uqadd clamps each 8-bit channel sum at 255 instead of wrapping.
2846 "uqadd v0.8b, v0.8b, v4.8b \n" | 2856 "uqadd v0.8b, v0.8b, v4.8b \n" |
2847 "uqadd v1.8b, v1.8b, v5.8b \n" | 2857 "uqadd v1.8b, v1.8b, v5.8b \n" |
2848 "uqadd v2.8b, v2.8b, v6.8b \n" | 2858 "uqadd v2.8b, v2.8b, v6.8b \n" |
2849 "uqadd v3.8b, v3.8b, v7.8b \n" | 2859 "uqadd v3.8b, v3.8b, v7.8b \n" |
2850 MEMACCESS(2) | 2860 MEMACCESS(2) |
2851 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2861 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
// Loop again while the remaining width is still greater than zero.
2852 "b.gt 1b \n" | 2862 "b.gt 1b \n" |
2853 | 2863 |
// All four operands are read-write ("+r"): the asm advances the three
// pointers and counts width down.
2854 : "+r"(src_argb0), // %0 | 2864 : "+r"(src_argb0), // %0 |
2855 "+r"(src_argb1), // %1 | 2865 "+r"(src_argb1), // %1 |
2856 "+r"(dst_argb), // %2 | 2866 "+r"(dst_argb), // %2 |
2857 "+r"(width) // %3 | 2867 "+r"(width) // %3 |
2858 : | 2868 : |
// Clobbers: "cc" for the flags written by subs, "memory" for the stores
// through dst_argb, and v0-v7 as the vector registers used above.
2859 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2869 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
2860 ); | 2870 ); |
2861 } | 2871 } |
2862 #endif // HAS_ARGBADDROW_NEON | 2872 #endif // HAS_ARGBADDROW_NEON |
2863 | 2873 |
2864 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | 2874 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
2865 #ifdef HAS_ARGBSUBTRACTROW_NEON | 2875 #ifdef HAS_ARGBSUBTRACTROW_NEON |
2866 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2876 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
2867 uint8* dst_argb, int width) { | 2877 uint8* dst_argb, int width) { |
2868 asm volatile ( | 2878 asm volatile ( |
2869 // 8 pixel loop. | 2879 // 8 pixel loop. |
2870 "1: \n" | 2880 "1: \n" |
2871 MEMACCESS(0) | 2881 MEMACCESS(0) |
2872 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2882 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
2873 MEMACCESS(1) | 2883 MEMACCESS(1) |
2874 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2884 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
2875 "subs %3, %3, #8 \n" // 8 processed per loop. | 2885 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2876 "uqsub v0.8b, v0.8b, v4.8b \n" | 2886 "uqsub v0.8b, v0.8b, v4.8b \n" |
2877 "uqsub v1.8b, v1.8b, v5.8b \n" | 2887 "uqsub v1.8b, v1.8b, v5.8b \n" |
2878 "uqsub v2.8b, v2.8b, v6.8b \n" | 2888 "uqsub v2.8b, v2.8b, v6.8b \n" |
2879 "uqsub v3.8b, v3.8b, v7.8b \n" | 2889 "uqsub v3.8b, v3.8b, v7.8b \n" |
2880 MEMACCESS(2) | 2890 MEMACCESS(2) |
2881 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2891 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2882 "b.gt 1b \n" | 2892 "b.gt 1b \n" |
2883 | 2893 |
2884 : "+r"(src_argb0), // %0 | 2894 : "+r"(src_argb0), // %0 |
2885 "+r"(src_argb1), // %1 | 2895 "+r"(src_argb1), // %1 |
(...skipping 14 matching lines...) Expand all Loading... |
// SobelRow_NEON: combines a row of Sobel X values with a row of Sobel Y
// values and expands the result to ARGB.
//   src_sobelx, src_sobely - source rows; 8 bytes are read from each per loop
//                            pass and both pointers are post-incremented.
//   dst_argb               - destination row; 32 bytes (8 pixels) are stored
//                            per loop pass.
//   width                  - pixel count; decremented by 8 per pass.
// Every output pixel gets the saturated sum sobelx + sobely in its first three
// channels and 255 in the fourth (alpha).
// The loop condition is tested at the bottom, so one pass (8 pixels) always
// runs. Presumably callers pass a positive multiple of 8 - TODO confirm.
2900 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2910 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
2901 uint8* dst_argb, int width) { | 2911 uint8* dst_argb, int width) { |
2902 asm volatile ( | 2912 asm volatile ( |
// v3 is set once before the loop; nothing in the loop overwrites it.
2903 "movi v3.8b, #255 \n" // alpha | 2913 "movi v3.8b, #255 \n" // alpha |
2904 // 8 pixel loop. | 2914 // 8 pixel loop. |
2905 "1: \n" | 2915 "1: \n" |
2906 MEMACCESS(0) | 2916 MEMACCESS(0) |
2907 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. | 2917 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. |
2908 MEMACCESS(1) | 2918 MEMACCESS(1) |
2909 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. | 2919 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. |
// width is an int: the right-hand column decrements it through the 32-bit
// %w3 view of its register, the left-hand column through the plain %3 form.
// subs also sets the flags consumed by b.gt below.
2910 "subs %3, %3, #8 \n" // 8 processed per loop. | 2920 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
// uqadd clamps the 8-bit sum at 255 instead of wrapping.
2911 "uqadd v0.8b, v0.8b, v1.8b \n" // add | 2921 "uqadd v0.8b, v0.8b, v1.8b \n" // add |
// orr of a register with itself is a plain copy: replicate the sum into
// v1 and v2 so that st4 writes it to three channels of each pixel.
2912 "orr v1.8b, v0.8b, v0.8b \n" | 2922 "orr v1.8b, v0.8b, v0.8b \n" |
2913 "orr v2.8b, v0.8b, v0.8b \n" | 2923 "orr v2.8b, v0.8b, v0.8b \n" |
2914 MEMACCESS(2) | 2924 MEMACCESS(2) |
// st4 interleaves v0,v1,v2,v3 into 8 four-byte pixels.
2915 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2925 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
// Loop again while the remaining width is still greater than zero.
2916 "b.gt 1b \n" | 2926 "b.gt 1b \n" |
// All four operands are read-write ("+r"): the asm advances the three
// pointers and counts width down.
2917 : "+r"(src_sobelx), // %0 | 2927 : "+r"(src_sobelx), // %0 |
2918 "+r"(src_sobely), // %1 | 2928 "+r"(src_sobely), // %1 |
2919 "+r"(dst_argb), // %2 | 2929 "+r"(dst_argb), // %2 |
2920 "+r"(width) // %3 | 2930 "+r"(width) // %3 |
2921 : | 2931 : |
// Clobbers: "cc" for the flags written by subs, "memory" for the stores
// through dst_argb, and v0-v3 as the vector registers used above.
2922 : "cc", "memory", "v0", "v1", "v2", "v3" | 2932 : "cc", "memory", "v0", "v1", "v2", "v3" |
2923 ); | 2933 ); |
2924 } | 2934 } |
2925 #endif // HAS_SOBELROW_NEON | 2935 #endif // HAS_SOBELROW_NEON |
2926 | 2936 |
2927 // Adds Sobel X and Sobel Y and stores Sobel into plane. | 2937 // Adds Sobel X and Sobel Y and stores Sobel into plane. |
2928 #ifdef HAS_SOBELTOPLANEROW_NEON | 2938 #ifdef HAS_SOBELTOPLANEROW_NEON |
2929 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2939 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
2930 uint8* dst_y, int width) { | 2940 uint8* dst_y, int width) { |
2931 asm volatile ( | 2941 asm volatile ( |
2932 // 16 pixel loop. | 2942 // 16 pixel loop. |
2933 "1: \n" | 2943 "1: \n" |
2934 MEMACCESS(0) | 2944 MEMACCESS(0) |
2935 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. | 2945 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. |
2936 MEMACCESS(1) | 2946 MEMACCESS(1) |
2937 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. | 2947 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. |
2938 "subs %3, %3, #16 \n" // 16 processed per loop. | 2948 "subs %w3, %w3, #16 \n" // 16 processed per loop. |
2939 "uqadd v0.16b, v0.16b, v1.16b \n" // add | 2949 "uqadd v0.16b, v0.16b, v1.16b \n" // add |
2940 MEMACCESS(2) | 2950 MEMACCESS(2) |
2941 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. | 2951 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. |
2942 "b.gt 1b \n" | 2952 "b.gt 1b \n" |
2943 : "+r"(src_sobelx), // %0 | 2953 : "+r"(src_sobelx), // %0 |
2944 "+r"(src_sobely), // %1 | 2954 "+r"(src_sobely), // %1 |
2945 "+r"(dst_y), // %2 | 2955 "+r"(dst_y), // %2 |
2946 "+r"(width) // %3 | 2956 "+r"(width) // %3 |
2947 : | 2957 : |
2948 : "cc", "memory", "v0", "v1" | 2958 : "cc", "memory", "v0", "v1" |
(...skipping 10 matching lines...) Expand all Loading... |
2959 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2969 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
2960 uint8* dst_argb, int width) { | 2970 uint8* dst_argb, int width) { |
2961 asm volatile ( | 2971 asm volatile ( |
2962 "movi v3.8b, #255 \n" // alpha | 2972 "movi v3.8b, #255 \n" // alpha |
2963 // 8 pixel loop. | 2973 // 8 pixel loop. |
2964 "1: \n" | 2974 "1: \n" |
2965 MEMACCESS(0) | 2975 MEMACCESS(0) |
2966 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. | 2976 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. |
2967 MEMACCESS(1) | 2977 MEMACCESS(1) |
2968 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. | 2978 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. |
2969 "subs %3, %3, #8 \n" // 8 processed per loop. | 2979 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
2970 "uqadd v1.8b, v0.8b, v2.8b \n" // add | 2980 "uqadd v1.8b, v0.8b, v2.8b \n" // add |
2971 MEMACCESS(2) | 2981 MEMACCESS(2) |
2972 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2982 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
2973 "b.gt 1b \n" | 2983 "b.gt 1b \n" |
2974 : "+r"(src_sobelx), // %0 | 2984 : "+r"(src_sobelx), // %0 |
2975 "+r"(src_sobely), // %1 | 2985 "+r"(src_sobely), // %1 |
2976 "+r"(dst_argb), // %2 | 2986 "+r"(dst_argb), // %2 |
2977 "+r"(width) // %3 | 2987 "+r"(width) // %3 |
2978 : | 2988 : |
2979 : "cc", "memory", "v0", "v1", "v2", "v3" | 2989 : "cc", "memory", "v0", "v1", "v2", "v3" |
(...skipping 19 matching lines...) Expand all Loading... |
2999 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 | 3009 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 |
3000 MEMACCESS(1) | 3010 MEMACCESS(1) |
3001 "ld1 {v3.8b}, [%1],%6 \n" | 3011 "ld1 {v3.8b}, [%1],%6 \n" |
3002 "usubl v1.8h, v2.8b, v3.8b \n" | 3012 "usubl v1.8h, v2.8b, v3.8b \n" |
3003 "add v0.8h, v0.8h, v1.8h \n" | 3013 "add v0.8h, v0.8h, v1.8h \n" |
3004 "add v0.8h, v0.8h, v1.8h \n" | 3014 "add v0.8h, v0.8h, v1.8h \n" |
3005 MEMACCESS(2) | 3015 MEMACCESS(2) |
3006 "ld1 {v2.8b}, [%2],%5 \n" // bottom | 3016 "ld1 {v2.8b}, [%2],%5 \n" // bottom |
3007 MEMACCESS(2) | 3017 MEMACCESS(2) |
3008 "ld1 {v3.8b}, [%2],%6 \n" | 3018 "ld1 {v3.8b}, [%2],%6 \n" |
3009 "subs %4, %4, #8 \n" // 8 pixels | 3019 "subs %w4, %w4, #8 \n" // 8 pixels |
3010 "usubl v1.8h, v2.8b, v3.8b \n" | 3020 "usubl v1.8h, v2.8b, v3.8b \n" |
3011 "add v0.8h, v0.8h, v1.8h \n" | 3021 "add v0.8h, v0.8h, v1.8h \n" |
3012 "abs v0.8h, v0.8h \n" | 3022 "abs v0.8h, v0.8h \n" |
3013 "uqxtn v0.8b, v0.8h \n" | 3023 "uqxtn v0.8b, v0.8h \n" |
3014 MEMACCESS(3) | 3024 MEMACCESS(3) |
3015 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx | 3025 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx |
3016 "b.gt 1b \n" | 3026 "b.gt 1b \n" |
3017 : "+r"(src_y0), // %0 | 3027 : "+r"(src_y0), // %0 |
3018 "+r"(src_y1), // %1 | 3028 "+r"(src_y1), // %1 |
3019 "+r"(src_y2), // %2 | 3029 "+r"(src_y2), // %2 |
3020 "+r"(dst_sobelx), // %3 | 3030 "+r"(dst_sobelx), // %3 |
3021 "+r"(width) // %4 | 3031 "+r"(width) // %4 |
3022 : "r"(2), // %5 | 3032 : "r"(2LL), // %5 |
3023 "r"(6) // %6 | 3033 "r"(6LL) // %6 |
3024 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 3034 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
3025 ); | 3035 ); |
3026 } | 3036 } |
3027 #endif // HAS_SOBELXROW_NEON | 3037 #endif // HAS_SOBELXROW_NEON |
3028 | 3038 |
3029 // SobelY as a matrix is | 3039 // SobelY as a matrix is |
3030 // -1 -2 -1 | 3040 // -1 -2 -1 |
3031 // 0 0 0 | 3041 // 0 0 0 |
3032 // 1 2 1 | 3042 // 1 2 1 |
3033 #ifdef HAS_SOBELYROW_NEON | 3043 #ifdef HAS_SOBELYROW_NEON |
(...skipping 10 matching lines...) Expand all Loading... |
3044 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 | 3054 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 |
3045 MEMACCESS(1) | 3055 MEMACCESS(1) |
3046 "ld1 {v3.8b}, [%1],%4 \n" | 3056 "ld1 {v3.8b}, [%1],%4 \n" |
3047 "usubl v1.8h, v2.8b, v3.8b \n" | 3057 "usubl v1.8h, v2.8b, v3.8b \n" |
3048 "add v0.8h, v0.8h, v1.8h \n" | 3058 "add v0.8h, v0.8h, v1.8h \n" |
3049 "add v0.8h, v0.8h, v1.8h \n" | 3059 "add v0.8h, v0.8h, v1.8h \n" |
3050 MEMACCESS(0) | 3060 MEMACCESS(0) |
3051 "ld1 {v2.8b}, [%0],%5 \n" // right | 3061 "ld1 {v2.8b}, [%0],%5 \n" // right |
3052 MEMACCESS(1) | 3062 MEMACCESS(1) |
3053 "ld1 {v3.8b}, [%1],%5 \n" | 3063 "ld1 {v3.8b}, [%1],%5 \n" |
3054 "subs %3, %3, #8 \n" // 8 pixels | 3064 "subs %w3, %w3, #8 \n" // 8 pixels |
3055 "usubl v1.8h, v2.8b, v3.8b \n" | 3065 "usubl v1.8h, v2.8b, v3.8b \n" |
3056 "add v0.8h, v0.8h, v1.8h \n" | 3066 "add v0.8h, v0.8h, v1.8h \n" |
3057 "abs v0.8h, v0.8h \n" | 3067 "abs v0.8h, v0.8h \n" |
3058 "uqxtn v0.8b, v0.8h \n" | 3068 "uqxtn v0.8b, v0.8h \n" |
3059 MEMACCESS(2) | 3069 MEMACCESS(2) |
3060 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely | 3070 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely |
3061 "b.gt 1b \n" | 3071 "b.gt 1b \n" |
3062 : "+r"(src_y0), // %0 | 3072 : "+r"(src_y0), // %0 |
3063 "+r"(src_y1), // %1 | 3073 "+r"(src_y1), // %1 |
3064 "+r"(dst_sobely), // %2 | 3074 "+r"(dst_sobely), // %2 |
3065 "+r"(width) // %3 | 3075 "+r"(width) // %3 |
3066 : "r"(1), // %4 | 3076 : "r"(1LL), // %4 |
3067 "r"(6) // %5 | 3077 "r"(6LL) // %5 |
3068 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 3078 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
3069 ); | 3079 ); |
3070 } | 3080 } |
3071 #endif // HAS_SOBELYROW_NEON | 3081 #endif // HAS_SOBELYROW_NEON |
3072 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 3082 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
3073 | 3083 |
3074 #ifdef __cplusplus | 3084 #ifdef __cplusplus |
3075 } // extern "C" | 3085 } // extern "C" |
3076 } // namespace libyuv | 3086 } // namespace libyuv |
3077 #endif | 3087 #endif |
OLD | NEW |