| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
| 12 | 12 |
| 13 #ifdef __cplusplus | 13 #ifdef __cplusplus |
| 14 namespace libyuv { | 14 namespace libyuv { |
| 15 extern "C" { | 15 extern "C" { |
| 16 #endif | 16 #endif |
| 17 | 17 |
| 18 // This module is for GCC Neon armv8 64 bit. | 18 // This module is for GCC Neon armv8 64 bit. |
| 19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 20 | 20 |
| 21 // Read 8 Y, 4 U and 4 V from 422. Result: v0 = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V. | 21 // Read 8 Y, 4 U and 4 V from 422. Result: v0 = 8 Y, v1.s[0] = 4 U, v1.s[1] = 4 V. |
| 22 #define READYUV422 \ | 22 #define READYUV422 \ |
| 23 MEMACCESS(0) \ | 23 MEMACCESS(0) \ |
| 24 "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes from operand %0; pointer += 8 */ \ | 24 "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes from operand %0; pointer += 8 */ \ |
| 25 MEMACCESS(1) \ | 25 MEMACCESS(1) \ |
| 26 "ld1 {v1.s}[0], [%1], #4 \n" /* low 32-bit lane of v1 = 4 U bytes from operand %1; pointer += 4 */ \ | 26 "ld1 {v1.s}[0], [%1], #4 \n" /* low 32-bit lane of v1 = 4 U bytes from operand %1; pointer += 4 */ \ |
| 27 MEMACCESS(2) \ | 27 MEMACCESS(2) \ |
| 28 "ld1 {v1.s}[1], [%2], #4 \n" /* next 32-bit lane of v1 = 4 V bytes from operand %2; pointer += 4 */ | 28 "ld1 {v1.s}[1], [%2], #4 \n" /* next 32-bit lane of v1 = 4 V bytes from operand %2; pointer += 4 */ |
| 29 | 29 |
| 30 // Read 8 Y, 2 U and 2 V from 411. Each U and V byte is duplicated so v1 ends up with 4 U then 4 V, the same layout READYUV422 produces. | |
| 31 #define READYUV411 \ | |
| 32 MEMACCESS(0) \ | |
| 33 "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes from operand %0; pointer += 8 */ \ | |
| 34 MEMACCESS(1) \ | |
| 35 "ld1 {v2.h}[0], [%1], #2 \n" /* 16-bit lane 0 of v2 = 2 U bytes from operand %1; pointer += 2 */ \ | |
| 36 MEMACCESS(2) \ | |
| 37 "ld1 {v2.h}[1], [%2], #2 \n" /* 16-bit lane 1 of v2 = 2 V bytes from operand %2; pointer += 2 */ \ | |
| 38 "zip1 v1.8b, v2.8b, v2.8b \n" /* interleave v2 with itself: v1 = U0 U0 U1 U1 V0 V0 V1 V1 */ | |
| 39 | |
| 40 // Read 8 Y, 8 U and 8 V from 444, then average adjacent U pairs and adjacent V pairs so v1 holds 4 U followed by 4 V. | 30 // Read 8 Y, 8 U and 8 V from 444, then average adjacent U pairs and adjacent V pairs so v1 holds 4 U followed by 4 V. |
| 41 #define READYUV444 \ | 31 #define READYUV444 \ |
| 42 MEMACCESS(0) \ | 32 MEMACCESS(0) \ |
| 43 "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes from operand %0; pointer += 8 */ \ | 33 "ld1 {v0.8b}, [%0], #8 \n" /* v0 = 8 Y bytes from operand %0; pointer += 8 */ \ |
| 44 MEMACCESS(1) \ | 34 MEMACCESS(1) \ |
| 45 "ld1 {v1.d}[0], [%1], #8 \n" /* low 64 bits of v1 = 8 U bytes from operand %1; pointer += 8 */ \ | 35 "ld1 {v1.d}[0], [%1], #8 \n" /* low 64 bits of v1 = 8 U bytes from operand %1; pointer += 8 */ \ |
| 46 MEMACCESS(2) \ | 36 MEMACCESS(2) \ |
| 47 "ld1 {v1.d}[1], [%2], #8 \n" /* high 64 bits of v1 = 8 V bytes from operand %2; pointer += 8 */ \ | 37 "ld1 {v1.d}[1], [%2], #8 \n" /* high 64 bits of v1 = 8 V bytes from operand %2; pointer += 8 */ \ |
| 48 "uaddlp v1.8h, v1.16b \n" /* add adjacent byte pairs: 4 U sums then 4 V sums, widened to 16 bits */ \ | 38 "uaddlp v1.8h, v1.16b \n" /* add adjacent byte pairs: 4 U sums then 4 V sums, widened to 16 bits */ \ |
| 49 "rshrn v1.8b, v1.8h, #1 \n" /* rounding shift right by 1 and narrow: each pair sum becomes its average */ | 39 "rshrn v1.8b, v1.8h, #1 \n" /* rounding shift right by 1 and narrow: each pair sum becomes its average */ |
| (...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 213 "+r"(width) // %5 | 203 "+r"(width) // %5 |
| 214 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 204 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 215 [kUVToG]"r"(&yuvconstants->kUVToG), | 205 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 216 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 206 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 217 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 207 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 208 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 219 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 209 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 220 ); | 210 ); |
| 221 } | 211 } |
| 222 | 212 |
| 223 void I411ToARGBRow_NEON(const uint8* src_y, /* Y row; 8 bytes are read per loop iteration */ | |
| 224 const uint8* src_u, /* U row; 2 bytes are read per loop iteration */ | |
| 225 const uint8* src_v, /* V row; 2 bytes are read per loop iteration */ | |
| 226 uint8* dst_argb, /* output row; 32 bytes are written per loop iteration */ | |
| 227 const struct YuvConstants* yuvconstants, /* addresses of four of its members are passed to the asm as named inputs */ | |
| 228 int width) { /* pixel count. Converts one row from I411 to 4-byte pixels, 8 pixels per iteration. The loop body runs before width is tested, so at least one group of 8 is always processed and a width that is not a multiple of 8 is rounded up. */ | |
| 229 asm volatile ( | |
| 230 YUVTORGB_SETUP /* defined outside this view; presumably loads the conversion constants from the named inputs - confirm */ | |
| 231 "movi v23.8b, #255 \n" /* A: constant alpha of 255 in every byte of v23 */ | |
| 232 "1: \n" /* loop head */ | |
| 233 READYUV411 /* v0 = 8 Y, v1 = 4 U then 4 V (each source U/V byte duplicated) */ | |
| 234 YUVTORGB(v22, v21, v20) /* defined outside this view; results are taken from v20, v21, v22 by the store below */ | |
| 235 "subs %w4, %w4, #8 \n" /* width -= 8 and set flags for the b.gt below */ | |
| 236 MEMACCESS(3) | |
| 237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" /* store 8 pixels, interleaving one byte from each of v20, v21, v22, v23 per pixel; dst += 32. Presumably B, G, R, A order - confirm against YUVTORGB */ | |
| 238 "b.gt 1b \n" /* repeat while the remaining width is greater than 0 */ | |
| 239 : "+r"(src_y), // %0: read and advanced by the asm | |
| 240 "+r"(src_u), // %1: read and advanced by the asm | |
| 241 "+r"(src_v), // %2: read and advanced by the asm | |
| 242 "+r"(dst_argb), // %3: written through and advanced by the asm | |
| 243 "+r"(width) // %4: decremented by 8 each iteration | |
| 244 : [kUVToRB]"r"(&yuvconstants->kUVToRB), /* read-only named inputs; not referenced by the asm text visible in this function, so presumably used by YUVTORGB_SETUP */ | |
| 245 [kUVToG]"r"(&yuvconstants->kUVToG), | |
| 246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | |
| 247 [kYToRgb]"r"(&yuvconstants->kYToRgb) | |
| 248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", /* clobbers: flags, memory and the vector registers the asm may modify */ | |
| 249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | |
| 250 ); | |
| 251 } | |
| 252 | |
| 253 void I422ToRGBARow_NEON(const uint8* src_y, | 213 void I422ToRGBARow_NEON(const uint8* src_y, |
| 254 const uint8* src_u, | 214 const uint8* src_u, |
| 255 const uint8* src_v, | 215 const uint8* src_v, |
| 256 uint8* dst_rgba, | 216 uint8* dst_rgba, |
| 257 const struct YuvConstants* yuvconstants, | 217 const struct YuvConstants* yuvconstants, |
| 258 int width) { | 218 int width) { |
| 259 asm volatile ( | 219 asm volatile ( |
| 260 YUVTORGB_SETUP | 220 YUVTORGB_SETUP |
| 261 "movi v20.8b, #255 \n" /* A */ | 221 "movi v20.8b, #255 \n" /* A */ |
| 262 "1: \n" | 222 "1: \n" |
| (...skipping 1125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1388 } | 1348 } |
| 1389 | 1349 |
| 1390 #define RGBTOUV_SETUP_REG /* fills v20-v25 with the constants used for RGB to U/V; every 16-bit lane of v20-v24 holds the same value */ \ | 1350 #define RGBTOUV_SETUP_REG /* fills v20-v25 with the constants used for RGB to U/V; every 16-bit lane of v20-v24 holds the same value */ \ |
| 1391 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2, i.e. 56/128 */ \ | 1351 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2, i.e. 56/128 */ \ |
| 1392 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2; stored as the magnitude 37 and subtracted with mls */ \ | 1352 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2; stored as the magnitude 37 and subtracted with mls */ \ |
| 1393 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2; stored as the magnitude 19 and subtracted with mls */ \ | 1353 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2; stored as the magnitude 19 and subtracted with mls */ \ |
| 1394 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2; stored as the magnitude 9 and subtracted with mls */ \ | 1354 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2; stored as the magnitude 9 and subtracted with mls */ \ |
| 1395 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2; stored as the magnitude 47 and subtracted with mls */ \ | 1355 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2; stored as the magnitude 47 and subtracted with mls */ \ |
| 1396 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit): every byte is 0x80, so each 16-bit lane is 128 plus a 0.5 rounding term in 8.8 fixed point */ | 1356 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit): every byte is 0x80, so each 16-bit lane is 128 plus a 0.5 rounding term in 8.8 fixed point */ |
| 1397 | 1357 |
| 1398 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. Each loop iteration reads 32 pixels (128 bytes) from one row and writes 8 U and 8 V bytes, one U/V pair per 4 horizontal pixels. The loop body runs before width is tested. | |
| 1399 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | |
| 1400 int width) { | |
| 1401 asm volatile ( | |
| 1402 RGBTOUV_SETUP_REG /* coefficient constants in v20-v24, 0x8080 bias in v25 */ | |
| 1403 "1: \n" | |
| 1404 MEMACCESS(0) | |
| 1405 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels, de-interleaved: v0 = B, v1 = G, v2 = R (v3 is not used below). | |
| 1406 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts (sum of each adjacent pair). | |
| 1407 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts (sum of each adjacent pair). | |
| 1408 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts (sum of each adjacent pair). | |
| 1409 MEMACCESS(0) | |
| 1410 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16 pixels: v4 = B, v5 = G, v6 = R (v7 is not used below). | |
| 1411 "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts (sum of each adjacent pair). | |
| 1412 "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts (sum of each adjacent pair). | |
| 1413 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts (sum of each adjacent pair). | |
| 1414 | |
| 1415 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts: each lane is now the sum of 4 consecutive pixels. | |
| 1416 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts: each lane is now the sum of 4 consecutive pixels. | |
| 1417 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts: each lane is now the sum of 4 consecutive pixels. | |
| 1418 | |
| 1419 "urshr v0.8h, v0.8h, #1 \n" // 2x average: rounding shift right by 1 of the 4-pixel sum. | |
| 1420 "urshr v1.8h, v1.8h, #1 \n" // same for G. | |
| 1421 "urshr v2.8h, v2.8h, #1 \n" // same for R. | |
| 1422 | |
| 1423 "subs %w3, %w3, #32 \n" // 32 processed per loop; sets the flags tested by b.gt. | |
| 1424 "mul v3.8h, v0.8h, v20.8h \n" // U = B * v20 (56) | |
| 1425 "mls v3.8h, v1.8h, v21.8h \n" // U -= G * v21 (37) | |
| 1426 "mls v3.8h, v2.8h, v22.8h \n" // U -= R * v22 (19) | |
| 1427 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned (adds 0x8080 per lane) | |
| 1428 "mul v4.8h, v2.8h, v20.8h \n" // V = R * v20 (56) | |
| 1429 "mls v4.8h, v1.8h, v24.8h \n" // V -= G * v24 (47) | |
| 1430 "mls v4.8h, v0.8h, v23.8h \n" // V -= B * v23 (9) | |
| 1431 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned (adds 0x8080 per lane) | |
| 1432 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U: shift right by 8 and narrow with unsigned saturation. | |
| 1433 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V: shift right by 8 and narrow with unsigned saturation. | |
| 1434 MEMACCESS(1) | |
| 1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 U bytes; dst_u += 8. | |
| 1436 MEMACCESS(2) | |
| 1437 "st1 {v1.8b}, [%2], #8 \n" // store 8 V bytes; dst_v += 8. | |
| 1438 "b.gt 1b \n" /* repeat while the remaining width is greater than 0 */ | |
| 1439 : "+r"(src_argb), // %0: advanced by 128 bytes per iteration | |
| 1440 "+r"(dst_u), // %1: advanced by 8 bytes per iteration | |
| 1441 "+r"(dst_v), // %2: advanced by 8 bytes per iteration | |
| 1442 "+r"(width) // %3: decremented by 32 per iteration | |
| 1443 : | |
| 1444 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", /* clobbers: flags, memory and every vector register written above */ | |
| 1445 "v20", "v21", "v22", "v23", "v24", "v25" | |
| 1446 ); | |
| 1447 } | |
| 1448 | |
| 1449 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1358 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 1450 #define RGBTOUV(QB, QG, QR) \ | 1359 #define RGBTOUV(QB, QG, QR) \ |
| 1451 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ | 1360 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ |
| 1452 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ | 1361 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ |
| 1453 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ | 1362 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ |
| 1454 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ | 1363 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ |
| 1455 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ | 1364 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ |
| 1456 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ | 1365 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ |
| 1457 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1366 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ |
| 1458 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1367 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ |
| (...skipping 1341 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2800 "r"(6LL) // %5 | 2709 "r"(6LL) // %5 |
| 2801 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2710 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 2802 ); | 2711 ); |
| 2803 } | 2712 } |
| 2804 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 2713 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 2805 | 2714 |
| 2806 #ifdef __cplusplus | 2715 #ifdef __cplusplus |
| 2807 } // extern "C" | 2716 } // extern "C" |
| 2808 } // namespace libyuv | 2717 } // namespace libyuv |
| 2809 #endif | 2718 #endif |
| OLD | NEW |