OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 12 matching lines...) Expand all Loading... |
23 | 23 |
24 // Read 8 Y, 4 U and 4 V from 422 | 24 // Read 8 Y, 4 U and 4 V from 422 |
// Asm fragment meant to be pasted into a conversion loop. Every load
// post-increments its pointer operand: 8 bytes from [%0] fill d0, 4 bytes
// from [%1] land in the low 32-bit lane of d2 and 4 bytes from [%2] in the
// high lane. Going by the comment above, %0/%1/%2 are the Y/U/V row
// pointers; the real binding is set by the enclosing asm operand list.
25 #define READYUV422 \ | 25 #define READYUV422 \ |
26 MEMACCESS(0) \ | 26 MEMACCESS(0) \ |
27 "vld1.8 {d0}, [%0]! \n" \ | 27 "vld1.8 {d0}, [%0]! \n" \ |
28 MEMACCESS(1) \ | 28 MEMACCESS(1) \ |
29 "vld1.32 {d2[0]}, [%1]! \n" \ | 29 "vld1.32 {d2[0]}, [%1]! \n" \ |
30 MEMACCESS(2) \ | 30 MEMACCESS(2) \ |
31 "vld1.32 {d2[1]}, [%2]! \n" | 31 "vld1.32 {d2[1]}, [%2]! \n" |
32 | 32 |
33 // Read 8 Y, 2 U and 2 V from 411 | |
// Every load post-increments its pointer operand: 8 bytes from [%0] fill d0,
// 2 bytes from [%1] go to 16-bit lane 0 of d2 and 2 bytes from [%2] to
// 16-bit lane 1. The vmov/vzip pair then duplicates each loaded chroma byte,
// leaving d2 as u0 u0 u1 u1 v0 v0 v1 v1, i.e. 4 U bytes followed by 4 V
// bytes, the same layout READYUV422 produces. d3 is overwritten as scratch.
34 #define READYUV411 \ | |
35 MEMACCESS(0) \ | |
36 "vld1.8 {d0}, [%0]! \n" \ | |
37 MEMACCESS(1) \ | |
38 "vld1.16 {d2[0]}, [%1]! \n" \ | |
39 MEMACCESS(2) \ | |
40 "vld1.16 {d2[1]}, [%2]! \n" \ | |
41 "vmov.u8 d3, d2 \n" \ | |
42 "vzip.u8 d2, d3 \n" | |
43 | |
44 // Read 8 Y, 8 U and 8 V from 444 | 33 // Read 8 Y, 8 U and 8 V from 444 |
// Every load post-increments its pointer operand: 8 bytes from [%0] fill d0,
// 8 bytes from [%1] fill d2 and 8 bytes from [%2] fill d3 (d2:d3 form q1).
// vpaddl sums each pair of adjacent bytes of q1 into a 16-bit lane, and
// vrshrn #1 halves those sums with rounding while narrowing back to bytes.
// d2 therefore ends up holding 4 pair-averaged values from [%1] followed by
// 4 pair-averaged values from [%2], matching the READYUV422 layout.
45 #define READYUV444 \ | 34 #define READYUV444 \ |
46 MEMACCESS(0) \ | 35 MEMACCESS(0) \ |
47 "vld1.8 {d0}, [%0]! \n" \ | 36 "vld1.8 {d0}, [%0]! \n" \ |
48 MEMACCESS(1) \ | 37 MEMACCESS(1) \ |
49 "vld1.8 {d2}, [%1]! \n" \ | 38 "vld1.8 {d2}, [%1]! \n" \ |
50 MEMACCESS(2) \ | 39 MEMACCESS(2) \ |
51 "vld1.8 {d3}, [%2]! \n" \ | 40 "vld1.8 {d3}, [%2]! \n" \ |
52 "vpaddl.u8 q1, q1 \n" \ | 41 "vpaddl.u8 q1, q1 \n" \ |
53 "vrshrn.u16 d2, q1, #1 \n" | 42 "vrshrn.u16 d2, q1, #1 \n" |
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
222 "+r"(width) // %5 | 211 "+r"(width) // %5 |
223 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 212 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
224 [kUVToG]"r"(&yuvconstants->kUVToG), | 213 [kUVToG]"r"(&yuvconstants->kUVToG), |
225 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 214 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
226 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 215 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
227 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", | 216 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", |
228 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | 217 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
229 ); | 218 ); |
230 } | 219 } |
231 | 220 |
// Converts one row of planar YUV (read through READYUV411: 8 Y, 2 U and 2 V
// bytes per iteration) into 4-byte-per-pixel output at dst_argb.
//   src_y / src_u / src_v: input plane pointers, advanced by the asm.
//   dst_argb: output pointer, advanced 32 bytes per iteration.
//   yuvconstants: its kUVToRB, kUVToG, kUVBiasBGR and kYToRgb members are
//     passed by address as named asm inputs.
//   width: pixel count; reduced by 8 per iteration.
// The loop body runs before width is tested, so 8 pixels are always
// processed, and the loop repeats while the remaining width is > 0.
// NOTE(review): YUVTORGB_SETUP and YUVTORGB are defined outside this view;
// presumably YUVTORGB leaves B, G, R in d20, d21, d22 - confirm.
232 void I411ToARGBRow_NEON(const uint8* src_y, | |
233 const uint8* src_u, | |
234 const uint8* src_v, | |
235 uint8* dst_argb, | |
236 const struct YuvConstants* yuvconstants, | |
237 int width) { | |
238 asm volatile ( | |
239 YUVTORGB_SETUP | |
// d23 is filled with 255 once, outside the loop; it is the fourth byte
// stored for every pixel by the vst4.8 below.
240 "vmov.u8 d23, #255 \n" | |
241 "1: \n" | |
242 READYUV411 | |
243 YUVTORGB | |
244 "subs %4, %4, #8 \n" | |
245 MEMACCESS(3) | |
// Interleaving store: pixel i receives d20[i], d21[i], d22[i], d23[i].
246 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" | |
247 "bgt 1b \n" | |
248 : "+r"(src_y), // %0 | |
249 "+r"(src_u), // %1 | |
250 "+r"(src_v), // %2 | |
251 "+r"(dst_argb), // %3 | |
252 "+r"(width) // %4 | |
253 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | |
254 [kUVToG]"r"(&yuvconstants->kUVToG), | |
255 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | |
256 [kYToRgb]"r"(&yuvconstants->kYToRgb) | |
257 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", | |
258 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |
259 ); | |
260 } | |
261 | |
262 void I422ToRGBARow_NEON(const uint8* src_y, | 221 void I422ToRGBARow_NEON(const uint8* src_y, |
263 const uint8* src_u, | 222 const uint8* src_u, |
264 const uint8* src_v, | 223 const uint8* src_v, |
265 uint8* dst_rgba, | 224 uint8* dst_rgba, |
266 const struct YuvConstants* yuvconstants, | 225 const struct YuvConstants* yuvconstants, |
267 int width) { | 226 int width) { |
268 asm volatile ( | 227 asm volatile ( |
269 YUVTORGB_SETUP | 228 YUVTORGB_SETUP |
270 "1: \n" | 229 "1: \n" |
271 READYUV422 | 230 READYUV422 |
(...skipping 1104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1376 "bgt 1b \n" | 1335 "bgt 1b \n" |
1377 : "+r"(src_argb), // %0 | 1336 : "+r"(src_argb), // %0 |
1378 "+r"(dst_u), // %1 | 1337 "+r"(dst_u), // %1 |
1379 "+r"(dst_v), // %2 | 1338 "+r"(dst_v), // %2 |
1380 "+r"(width) // %3 | 1339 "+r"(width) // %3 |
1381 : | 1340 : |
1382 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" | 1341 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" |
1383 ); | 1342 ); |
1384 } | 1343 } |
1385 | 1344 |
1386 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. | |
// Each iteration reads 32 pixels (four vld4.8 loads of 8 pixels, 128 bytes)
// from src_argb and writes 8 bytes to dst_u and 8 bytes to dst_v, so every
// output sample is derived from 4 horizontally adjacent input pixels.
// All three pointers are advanced by the asm; width is reduced by 32 per
// iteration and the loop repeats while the remainder is > 0. The body runs
// before the test, so one full 32-pixel iteration always executes.
// Arithmetic per output sample, using the 4-pixel channel sums halved with
// rounding (B, G, R below) and the halved coefficients loaded up front:
//   U = (56*B - 37*G - 19*R + 0x8080) >> 8
//   V = (56*R - 47*G -  9*B + 0x8080) >> 8
// with the final shift saturating to 8 bits. The fourth channel of each
// pixel is loaded (d6, d7, d14, d15) but never used.
1387 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | |
1388 int width) { | |
1389 asm volatile ( | |
1390 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient | |
1391 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient | |
1392 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient | |
1393 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient | |
1394 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient | |
1395 "vmov.u16 q15, #0x8080 \n" // 128.5 | |
1396 "1: \n" | |
1397 MEMACCESS(0) | |
1398 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. | |
1399 MEMACCESS(0) | |
1400 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. | |
1401 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. | |
1402 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. | |
1403 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. | |
1404 MEMACCESS(0) | |
1405 "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. | |
1406 MEMACCESS(0) | |
1407 "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. | |
1408 "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. | |
1409 "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. | |
1410 "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. | |
1411 | |
// Second pairwise add: combines the 2-pixel sums into 4-pixel sums and packs
// the results for all 32 pixels into q0 (B), q1 (G) and q2 (R).
1412 "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. | |
1413 "vpadd.u16 d1, d8, d9 \n" // B | |
1414 "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. | |
1415 "vpadd.u16 d3, d10, d11 \n" // G | |
1416 "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. | |
1417 "vpadd.u16 d5, d12, d13 \n" // R | |
1418 | |
// Halving a 4-pixel sum leaves twice the average, which is why the
// coefficients above were loaded divided by 2.
1419 "vrshr.u16 q0, q0, #1 \n" // 2x average | |
1420 "vrshr.u16 q1, q1, #1 \n" | |
1421 "vrshr.u16 q2, q2, #1 \n" | |
1422 | |
1423 "subs %3, %3, #32 \n" // 32 processed per loop. | |
1424 "vmul.s16 q8, q0, q10 \n" // B | |
1425 "vmls.s16 q8, q1, q11 \n" // G | |
1426 "vmls.s16 q8, q2, q12 \n" // R | |
1427 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned | |
1428 "vmul.s16 q9, q2, q10 \n" // R | |
1429 "vmls.s16 q9, q1, q14 \n" // G | |
1430 "vmls.s16 q9, q0, q13 \n" // B | |
1431 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned | |
1432 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U | |
1433 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V | |
1434 MEMACCESS(1) | |
1435 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. | |
1436 MEMACCESS(2) | |
1437 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. | |
1438 "bgt 1b \n" | |
1439 : "+r"(src_argb), // %0 | |
1440 "+r"(dst_u), // %1 | |
1441 "+r"(dst_v), // %2 | |
1442 "+r"(width) // %3 | |
1443 : | |
1444 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", | |
1445 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" | |
1446 ); | |
1447 } | |
1448 | |
1449 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1345 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
1450 #define RGBTOUV(QB, QG, QR) \ | 1346 #define RGBTOUV(QB, QG, QR) \ |
1451 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ | 1347 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ |
1452 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ | 1348 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ |
1453 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ | 1349 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ |
1454 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ | 1350 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ |
1455 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ | 1351 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ |
1456 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ | 1352 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ |
1457 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ | 1353 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ |
1458 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ | 1354 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ |
(...skipping 1377 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2836 "r"(6) // %5 | 2732 "r"(6) // %5 |
2837 : "cc", "memory", "q0", "q1" // Clobber List | 2733 : "cc", "memory", "q0", "q1" // Clobber List |
2838 ); | 2734 ); |
2839 } | 2735 } |
2840 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) | 2736 #endif // defined(__ARM_NEON__) && !defined(__aarch64__) |
2841 | 2737 |
2842 #ifdef __cplusplus | 2738 #ifdef __cplusplus |
2843 } // extern "C" | 2739 } // extern "C" |
2844 } // namespace libyuv | 2740 } // namespace libyuv |
2845 #endif | 2741 #endif |
OLD | NEW |