OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
12 | 12 |
13 #ifdef __cplusplus | 13 #ifdef __cplusplus |
14 namespace libyuv { | 14 namespace libyuv { |
15 extern "C" { | 15 extern "C" { |
16 #endif | 16 #endif |
17 | 17 |
18 // This module is for GCC Neon armv8 64 bit. | 18 // This module is for GCC Neon armv8 64 bit. |
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
20 | 20 |
21 // Read 8 Y, 4 U and 4 V from 422 | 21 // Read 8 Y, 4 U and 4 V from 422 |
22 #define READYUV422 \ | 22 #define READYUV422 \ |
23 MEMACCESS(0) \ | 23 MEMACCESS(0) \ |
24 "ld1 {v0.8b}, [%0], #8 \n" \ | 24 "ld1 {v0.8b}, [%0], #8 \n" \ |
25 MEMACCESS(1) \ | 25 MEMACCESS(1) \ |
26 "ld1 {v1.s}[0], [%1], #4 \n" \ | 26 "ld1 {v1.s}[0], [%1], #4 \n" \ |
27 MEMACCESS(2) \ | 27 MEMACCESS(2) \ |
28 "ld1 {v1.s}[1], [%2], #4 \n" | 28 "ld1 {v1.s}[1], [%2], #4 \n" |
29 | 29 |
30 // Read 8 Y, 2 U and 2 V from 411 | |
31 #define READYUV411 \ | |
32 MEMACCESS(0) \ | |
33 "ld1 {v0.8b}, [%0], #8 \n" \ | |
34 MEMACCESS(1) \ | |
35 "ld1 {v2.h}[0], [%1], #2 \n" \ | |
36 MEMACCESS(2) \ | |
37 "ld1 {v2.h}[1], [%2], #2 \n" \ | |
38 "zip1 v1.8b, v2.8b, v2.8b \n" | |
39 | |
40 // Read 8 Y, 8 U and 8 V from 444 | 30 // Read 8 Y, 8 U and 8 V from 444 |
41 #define READYUV444 \ | 31 #define READYUV444 \ |
42 MEMACCESS(0) \ | 32 MEMACCESS(0) \ |
43 "ld1 {v0.8b}, [%0], #8 \n" \ | 33 "ld1 {v0.8b}, [%0], #8 \n" \ |
44 MEMACCESS(1) \ | 34 MEMACCESS(1) \ |
45 "ld1 {v1.d}[0], [%1], #8 \n" \ | 35 "ld1 {v1.d}[0], [%1], #8 \n" \ |
46 MEMACCESS(2) \ | 36 MEMACCESS(2) \ |
47 "ld1 {v1.d}[1], [%2], #8 \n" \ | 37 "ld1 {v1.d}[1], [%2], #8 \n" \ |
48 "uaddlp v1.8h, v1.16b \n" \ | 38 "uaddlp v1.8h, v1.16b \n" \ |
49 "rshrn v1.8b, v1.8h, #1 \n" | 39 "rshrn v1.8b, v1.8h, #1 \n" |
(...skipping 163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
213 "+r"(width) // %5 | 203 "+r"(width) // %5 |
214 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 204 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
215 [kUVToG]"r"(&yuvconstants->kUVToG), | 205 [kUVToG]"r"(&yuvconstants->kUVToG), |
216 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 206 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
217 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 207 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 208 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
219 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 209 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
220 ); | 210 ); |
221 } | 211 } |
222 | 212 |
223 void I411ToARGBRow_NEON(const uint8* src_y, | |
224 const uint8* src_u, | |
225 const uint8* src_v, | |
226 uint8* dst_argb, | |
227 const struct YuvConstants* yuvconstants, | |
228 int width) { | |
229 asm volatile ( | |
230 YUVTORGB_SETUP | |
231 "movi v23.8b, #255 \n" /* A */ | |
232 "1: \n" | |
233 READYUV411 | |
234 YUVTORGB(v22, v21, v20) | |
235 "subs %w4, %w4, #8 \n" | |
236 MEMACCESS(3) | |
237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | |
238 "b.gt 1b \n" | |
239 : "+r"(src_y), // %0 | |
240 "+r"(src_u), // %1 | |
241 "+r"(src_v), // %2 | |
242 "+r"(dst_argb), // %3 | |
243 "+r"(width) // %4 | |
244 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | |
245 [kUVToG]"r"(&yuvconstants->kUVToG), | |
246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | |
247 [kYToRgb]"r"(&yuvconstants->kYToRgb) | |
248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | |
249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | |
250 ); | |
251 } | |
252 | |
253 void I422ToRGBARow_NEON(const uint8* src_y, | 213 void I422ToRGBARow_NEON(const uint8* src_y, |
254 const uint8* src_u, | 214 const uint8* src_u, |
255 const uint8* src_v, | 215 const uint8* src_v, |
256 uint8* dst_rgba, | 216 uint8* dst_rgba, |
257 const struct YuvConstants* yuvconstants, | 217 const struct YuvConstants* yuvconstants, |
258 int width) { | 218 int width) { |
259 asm volatile ( | 219 asm volatile ( |
260 YUVTORGB_SETUP | 220 YUVTORGB_SETUP |
261 "movi v20.8b, #255 \n" /* A */ | 221 "movi v20.8b, #255 \n" /* A */ |
262 "1: \n" | 222 "1: \n" |
(...skipping 1125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1388 } | 1348 } |
1389 | 1349 |
1390 #define RGBTOUV_SETUP_REG \ | 1350 #define RGBTOUV_SETUP_REG \ |
1391 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ | 1351 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ |
1392 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ | 1352 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ |
1393 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ | 1353 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ |
1394 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ | 1354 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ |
1395 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ | 1355 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ |
1396 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ | 1356 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ |
1397 | 1357 |
1398 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. | |
1399 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | |
1400 int width) { | |
1401 asm volatile ( | |
1402 RGBTOUV_SETUP_REG | |
1403 "1: \n" | |
1404 MEMACCESS(0) | |
1405 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | |
1406 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | |
1407 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | |
1408 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | |
1409 MEMACCESS(0) | |
1410 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. | |
1411 "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. | |
1412 "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. | |
1413 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. | |
1414 | |
1415 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. | |
1416 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. | |
1417 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. | |
1418 | |
1419 "urshr v0.8h, v0.8h, #1 \n" // 2x average | |
1420 "urshr v1.8h, v1.8h, #1 \n" | |
1421 "urshr v2.8h, v2.8h, #1 \n" | |
1422 | |
1423 "subs %w3, %w3, #32 \n" // 32 processed per loop. | |
1424 "mul v3.8h, v0.8h, v20.8h \n" // B | |
1425 "mls v3.8h, v1.8h, v21.8h \n" // G | |
1426 "mls v3.8h, v2.8h, v22.8h \n" // R | |
1427 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned | |
1428 "mul v4.8h, v2.8h, v20.8h \n" // R | |
1429 "mls v4.8h, v1.8h, v24.8h \n" // G | |
1430 "mls v4.8h, v0.8h, v23.8h \n" // B | |
1431 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned | |
1432 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U | |
1433 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V | |
1434 MEMACCESS(1) | |
1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. | |
1436 MEMACCESS(2) | |
1437 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. | |
1438 "b.gt 1b \n" | |
1439 : "+r"(src_argb), // %0 | |
1440 "+r"(dst_u), // %1 | |
1441 "+r"(dst_v), // %2 | |
1442 "+r"(width) // %3 | |
1443 : | |
1444 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | |
1445 "v20", "v21", "v22", "v23", "v24", "v25" | |
1446 ); | |
1447 } | |
1448 | |
1449 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1358 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
1450 #define RGBTOUV(QB, QG, QR) \ | 1359 #define RGBTOUV(QB, QG, QR) \ |
1451 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ | 1360 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ |
1452 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ | 1361 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ |
1453 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ | 1362 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ |
1454 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ | 1363 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ |
1455 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ | 1364 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ |
1456 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ | 1365 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ |
1457 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1366 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ |
1458 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1367 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ |
(...skipping 1341 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2800 "r"(6LL) // %5 | 2709 "r"(6LL) // %5 |
2801 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2710 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
2802 ); | 2711 ); |
2803 } | 2712 } |
2804 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 2713 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
2805 | 2714 |
2806 #ifdef __cplusplus | 2715 #ifdef __cplusplus |
2807 } // extern "C" | 2716 } // extern "C" |
2808 } // namespace libyuv | 2717 } // namespace libyuv |
2809 #endif | 2718 #endif |
OLD | NEW |