OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
229 "jg 1b \n" | 229 "jg 1b \n" |
230 : "+r"(src_y), // %0 | 230 : "+r"(src_y), // %0 |
231 "+r"(dst_argb), // %1 | 231 "+r"(dst_argb), // %1 |
232 "+r"(pix) // %2 | 232 "+r"(pix) // %2 |
233 : | 233 : |
234 : "memory", "cc", "xmm0", "xmm1", "xmm5" | 234 : "memory", "cc", "xmm0", "xmm1", "xmm5" |
235 ); | 235 ); |
236 } | 236 } |
237 #endif // TESTING | 237 #endif // TESTING |
238 | 238 |
239 #ifdef HAS_I400TOARGBROW_SSE2 | 239 #ifdef HAS_J400TOARGBROW_SSE2 |
240 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 240 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
241 asm volatile ( | 241 asm volatile ( |
242 "pcmpeqb %%xmm5,%%xmm5 \n" | 242 "pcmpeqb %%xmm5,%%xmm5 \n" |
243 "pslld $0x18,%%xmm5 \n" | 243 "pslld $0x18,%%xmm5 \n" |
244 LABELALIGN | 244 LABELALIGN |
245 "1: \n" | 245 "1: \n" |
246 "movq " MEMACCESS(0) ",%%xmm0 \n" | 246 "movq " MEMACCESS(0) ",%%xmm0 \n" |
247 "lea " MEMLEA(0x8,0) ",%0 \n" | 247 "lea " MEMLEA(0x8,0) ",%0 \n" |
248 "punpcklbw %%xmm0,%%xmm0 \n" | 248 "punpcklbw %%xmm0,%%xmm0 \n" |
249 "movdqa %%xmm0,%%xmm1 \n" | 249 "movdqa %%xmm0,%%xmm1 \n" |
250 "punpcklwd %%xmm0,%%xmm0 \n" | 250 "punpcklwd %%xmm0,%%xmm0 \n" |
251 "punpckhwd %%xmm1,%%xmm1 \n" | 251 "punpckhwd %%xmm1,%%xmm1 \n" |
252 "por %%xmm5,%%xmm0 \n" | 252 "por %%xmm5,%%xmm0 \n" |
253 "por %%xmm5,%%xmm1 \n" | 253 "por %%xmm5,%%xmm1 \n" |
254 "movdqu %%xmm0," MEMACCESS(1) " \n" | 254 "movdqu %%xmm0," MEMACCESS(1) " \n" |
255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
256 "lea " MEMLEA(0x20,1) ",%1 \n" | 256 "lea " MEMLEA(0x20,1) ",%1 \n" |
257 "sub $0x8,%2 \n" | 257 "sub $0x8,%2 \n" |
258 "jg 1b \n" | 258 "jg 1b \n" |
259 : "+r"(src_y), // %0 | 259 : "+r"(src_y), // %0 |
260 "+r"(dst_argb), // %1 | 260 "+r"(dst_argb), // %1 |
261 "+r"(pix) // %2 | 261 "+r"(pix) // %2 |
262 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 262 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
263 ); | 263 ); |
264 } | 264 } |
265 #endif // HAS_I400TOARGBROW_SSE2 | 265 #endif // HAS_J400TOARGBROW_SSE2 |
266 | 266 |
267 #ifdef HAS_RGB24TOARGBROW_SSSE3 | 267 #ifdef HAS_RGB24TOARGBROW_SSSE3 |
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
269 asm volatile ( | 269 asm volatile ( |
270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 | 270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 |
271 "pslld $0x18,%%xmm5 \n" | 271 "pslld $0x18,%%xmm5 \n" |
272 "movdqa %3,%%xmm4 \n" | 272 "movdqa %3,%%xmm4 \n" |
273 LABELALIGN | 273 LABELALIGN |
274 "1: \n" | 274 "1: \n" |
275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 670 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
946 "m"(kARGBToV), // %6 | 946 "m"(kARGBToV), // %6 |
947 "m"(kARGBToU), // %7 | 947 "m"(kARGBToU), // %7 |
948 "m"(kShufARGBToUV_AVX) // %8 | 948 "m"(kShufARGBToUV_AVX) // %8 |
949 : "memory", "cc", NACL_R14 | 949 : "memory", "cc", NACL_R14 |
950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
951 ); | 951 ); |
952 } | 952 } |
953 #endif // HAS_ARGBTOUVROW_AVX2 | 953 #endif // HAS_ARGBTOUVROW_AVX2 |
954 | 954 |
955 #ifdef HAS_ARGBTOUVJROW_SSSE3 | 955 #ifdef HAS_ARGBTOUVJROW_SSSE3 |
956 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. | |
957 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 956 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
958 uint8* dst_u, uint8* dst_v, int width) { | 957 uint8* dst_u, uint8* dst_v, int width) { |
959 asm volatile ( | 958 asm volatile ( |
960 "movdqa %5,%%xmm3 \n" | 959 "movdqa %5,%%xmm3 \n" |
961 "movdqa %6,%%xmm4 \n" | 960 "movdqa %6,%%xmm4 \n" |
962 "movdqa %7,%%xmm5 \n" | 961 "movdqa %7,%%xmm5 \n" |
963 "sub %1,%2 \n" | 962 "sub %1,%2 \n" |
964 LABELALIGN | 963 LABELALIGN |
965 "1: \n" | 964 "1: \n" |
966 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 965 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 440 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1407 "m"(kRGBAToV), // %5 | 1406 "m"(kRGBAToV), // %5 |
1408 "m"(kRGBAToU), // %6 | 1407 "m"(kRGBAToU), // %6 |
1409 "m"(kAddUV128) // %7 | 1408 "m"(kAddUV128) // %7 |
1410 : "memory", "cc", NACL_R14 | 1409 : "memory", "cc", NACL_R14 |
1411 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1410 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1412 ); | 1411 ); |
1413 } | 1412 } |
1414 | 1413 |
1415 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) | 1414 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
1416 | 1415 |
1417 // YUV to RGB conversion constants. | |
1418 // Y contribution to R,G,B. Scale and bias. | |
1419 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | |
1420 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | |
1421 | |
1422 // U and V contributions to R,G,B. | |
1423 #define UB -128 /* -min(128, round(2.018 * 64)) */ | |
1424 #define UG 25 /* -round(-0.391 * 64) */ | |
1425 #define VG 52 /* -round(-0.813 * 64) */ | |
1426 #define VR -102 /* -round(1.596 * 64) */ | |
1427 | |
1428 // Bias values to subtract 16 from Y and 128 from U and V. | |
1429 #define BB (UB * 128 - YGB) | |
1430 #define BG (UG * 128 + VG * 128 - YGB) | |
1431 #define BR (VR * 128 - YGB) | |
1432 | |
1433 struct YuvConstants { | 1416 struct YuvConstants { |
1434 lvec8 kUVToB; // 0 | 1417 lvec8 kUVToB; // 0 |
1435 lvec8 kUVToG; // 32 | 1418 lvec8 kUVToG; // 32 |
1436 lvec8 kUVToR; // 64 | 1419 lvec8 kUVToR; // 64 |
1437 lvec16 kUVBiasB; // 96 | 1420 lvec16 kUVBiasB; // 96 |
1438 lvec16 kUVBiasG; // 128 | 1421 lvec16 kUVBiasG; // 128 |
1439 lvec16 kUVBiasR; // 160 | 1422 lvec16 kUVBiasR; // 160 |
1440 lvec16 kYToRgb; // 192 | 1423 lvec16 kYToRgb; // 192 |
1441 }; | 1424 }; |
1442 | 1425 |
| 1426 // BT.601 YUV to RGB reference |
| 1427 // R = (Y - 16) * 1.164 - V * -1.596 |
| 1428 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
| 1429 // B = (Y - 16) * 1.164 - U * -2.018 |
| 1430 |
| 1431 // Y contribution to R,G,B. Scale and bias. |
| 1432 // TODO(fbarchard): Consider moving constants into a common header. |
| 1433 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
| 1434 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
| 1435 |
| 1436 // U and V contributions to R,G,B. |
| 1437 #define UB -128 /* max(-128, round(-2.018 * 64)) */ |
| 1438 #define UG 25 /* round(0.391 * 64) */ |
| 1439 #define VG 52 /* round(0.813 * 64) */ |
| 1440 #define VR -102 /* round(-1.596 * 64) */ |
| 1441 |
| 1442 // Bias values to subtract 16 from Y and 128 from U and V. |
| 1443 #define BB (UB * 128 + YGB) |
| 1444 #define BG (UG * 128 + VG * 128 + YGB) |
| 1445 #define BR (VR * 128 + YGB) |
| 1446 |
1443 // BT601 constants for YUV to RGB. | 1447 // BT601 constants for YUV to RGB. |
1444 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | 1448 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
1445 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | 1449 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
1446 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | 1450 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, |
1447 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | 1451 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
1448 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | 1452 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
1449 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | 1453 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, |
1450 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | 1454 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, |
1451 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 1455 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
1452 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 1456 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
1453 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 1457 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
1454 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 1458 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
1455 }; | 1459 }; |
1456 | 1460 |
1457 // BT601 constants for NV21 where chroma plane is VU instead of UV. | 1461 // BT601 constants for NV21 where chroma plane is VU instead of UV. |
1458 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | 1462 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
1459 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | 1463 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, |
1460 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | 1464 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, |
1461 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | 1465 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
1462 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | 1466 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
1463 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | 1467 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, |
1464 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | 1468 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, |
1465 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 1469 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
1466 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 1470 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
1467 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 1471 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
1468 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 1472 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
1469 }; | 1473 }; |
1470 | 1474 |
| 1475 #undef YG |
| 1476 #undef YGB |
| 1477 #undef UB |
| 1478 #undef UG |
| 1479 #undef VG |
| 1480 #undef VR |
| 1481 #undef BB |
| 1482 #undef BG |
| 1483 #undef BR |
| 1484 |
| 1485 // JPEG YUV to RGB reference |
| 1486 // * R = Y - V * -1.40200 |
| 1487 // * G = Y - U * 0.34414 - V * 0.71414 |
| 1488 // * B = Y - U * -1.77200 |
| 1489 |
| 1490 // Y contribution to R,G,B. Scale and bias. |
| 1491 // TODO(fbarchard): Consider moving constants into a common header. |
| 1492 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
| 1493 #define YGBJ 32 /* 64 / 2 */ |
| 1494 |
| 1495 // U and V contributions to R,G,B. |
| 1496 #define UBJ -113 /* round(-1.77200 * 64) */ |
| 1497 #define UGJ 22 /* round(0.34414 * 64) */ |
| 1498 #define VGJ 46 /* round(0.71414 * 64) */ |
| 1499 #define VRJ -90 /* round(-1.40200 * 64) */ |
| 1500 |
| 1501 // Bias values to subtract 16 from Y and 128 from U and V. |
| 1502 #define BBJ (UBJ * 128 + YGBJ) |
| 1503 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
| 1504 #define BRJ (VRJ * 128 + YGBJ) |
| 1505 |
| 1506 // JPEG constants for YUV to RGB. |
| 1507 YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
| 1508 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
| 1509 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, |
| 1510 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 1511 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 1512 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 1513 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, |
| 1514 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
| 1515 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, |
| 1516 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
| 1517 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, |
| 1518 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
| 1519 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, |
| 1520 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
| 1521 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, |
| 1522 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
| 1523 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } |
| 1524 }; |
| 1525 |
| 1526 #undef YGJ |
| 1527 #undef YGBJ |
| 1528 #undef UBJ |
| 1529 #undef UGJ |
| 1530 #undef VGJ |
| 1531 #undef VRJ |
| 1532 #undef BBJ |
| 1533 #undef BGJ |
| 1534 #undef BRJ |
| 1535 |
1471 // Read 8 UV from 444 | 1536 // Read 8 UV from 444 |
1472 #define READYUV444 \ | 1537 #define READYUV444 \ |
1473 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1538 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1474 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1539 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1475 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1540 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1476 "punpcklbw %%xmm1,%%xmm0 \n" | 1541 "punpcklbw %%xmm1,%%xmm0 \n" |
1477 | 1542 |
1478 // Read 4 UV from 422, upsample to 8 UV | 1543 // Read 4 UV from 422, upsample to 8 UV |
1479 #define READYUV422 \ | 1544 #define READYUV422 \ |
1480 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1545 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1527 "packuswb %%xmm2,%%xmm2 \n" | 1592 "packuswb %%xmm2,%%xmm2 \n" |
1528 | 1593 |
1529 // Store 8 ARGB values. Assumes XMM5 is zero. | 1594 // Store 8 ARGB values. Assumes XMM5 is zero. |
1530 #define STOREARGB \ | 1595 #define STOREARGB \ |
1531 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1596 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1532 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1597 "punpcklbw %%xmm5,%%xmm2 \n" \ |
1533 "movdqa %%xmm0,%%xmm1 \n" \ | 1598 "movdqa %%xmm0,%%xmm1 \n" \ |
1534 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1599 "punpcklwd %%xmm2,%%xmm0 \n" \ |
1535 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1600 "punpckhwd %%xmm2,%%xmm1 \n" \ |
1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1601 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ | 1602 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" | 1603 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
1539 | 1604 |
1540 // Store 8 BGRA values. Assumes XMM5 is zero. | 1605 // Store 8 BGRA values. Assumes XMM5 is zero. |
1541 #define STOREBGRA \ | 1606 #define STOREBGRA \ |
1542 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1607 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1543 "punpcklbw %%xmm0,%%xmm1 \n" \ | 1608 "punpcklbw %%xmm0,%%xmm1 \n" \ |
1544 "punpcklbw %%xmm2,%%xmm5 \n" \ | 1609 "punpcklbw %%xmm2,%%xmm5 \n" \ |
1545 "movdqa %%xmm5,%%xmm0 \n" \ | 1610 "movdqa %%xmm5,%%xmm0 \n" \ |
1546 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1611 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1547 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1612 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1548 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | 1613 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ |
1549 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ | 1614 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
1550 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" | 1615 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
1551 | 1616 |
1552 // Store 8 ABGR values. Assumes XMM5 is zero. | 1617 // Store 8 ABGR values. Assumes XMM5 is zero. |
1553 #define STOREABGR \ | 1618 #define STOREABGR \ |
1554 "punpcklbw %%xmm1,%%xmm2 \n" \ | 1619 "punpcklbw %%xmm1,%%xmm2 \n" \ |
1555 "punpcklbw %%xmm5,%%xmm0 \n" \ | 1620 "punpcklbw %%xmm5,%%xmm0 \n" \ |
1556 "movdqa %%xmm2,%%xmm1 \n" \ | 1621 "movdqa %%xmm2,%%xmm1 \n" \ |
1557 "punpcklwd %%xmm0,%%xmm2 \n" \ | 1622 "punpcklwd %%xmm0,%%xmm2 \n" \ |
1558 "punpckhwd %%xmm0,%%xmm1 \n" \ | 1623 "punpckhwd %%xmm0,%%xmm1 \n" \ |
1559 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | 1624 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ |
1560 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ | 1625 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
1561 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" | 1626 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
1562 | 1627 |
1563 // Store 8 RGBA values. Assumes XMM5 is zero. | 1628 // Store 8 RGBA values. Assumes XMM5 is zero. |
1564 #define STORERGBA \ | 1629 #define STORERGBA \ |
1565 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1630 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1566 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1631 "punpcklbw %%xmm2,%%xmm1 \n" \ |
1567 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1632 "punpcklbw %%xmm0,%%xmm5 \n" \ |
1568 "movdqa %%xmm5,%%xmm0 \n" \ | 1633 "movdqa %%xmm5,%%xmm0 \n" \ |
1569 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1634 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1570 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1635 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1571 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1636 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
1572 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ | 1637 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1573 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" | 1638 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
1574 | 1639 |
1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | 1640 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
1576 const uint8* u_buf, | 1641 const uint8* u_buf, |
1577 const uint8* v_buf, | 1642 const uint8* v_buf, |
1578 uint8* dst_argb, | 1643 uint8* dst_argb, |
1579 int width) { | 1644 int width) { |
1580 asm volatile ( | 1645 asm volatile ( |
1581 "sub %[u_buf],%[v_buf] \n" | 1646 "sub %[u_buf],%[v_buf] \n" |
1582 "pcmpeqb %%xmm5,%%xmm5 \n" | 1647 "pcmpeqb %%xmm5,%%xmm5 \n" |
1583 LABELALIGN | 1648 LABELALIGN |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1706 [u_buf]"+r"(u_buf), // %[u_buf] | 1771 [u_buf]"+r"(u_buf), // %[u_buf] |
1707 [v_buf]"+r"(v_buf), // %[v_buf] | 1772 [v_buf]"+r"(v_buf), // %[v_buf] |
1708 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1773 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1709 [width]"+rm"(width) // %[width] | 1774 [width]"+rm"(width) // %[width] |
1710 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
1711 : "memory", "cc", NACL_R14 | 1776 : "memory", "cc", NACL_R14 |
1712 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1777 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1713 ); | 1778 ); |
1714 } | 1779 } |
1715 | 1780 |
| 1781 void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, |
| 1782 const uint8* u_buf, |
| 1783 const uint8* v_buf, |
| 1784 uint8* dst_argb, |
| 1785 int width) { |
| 1786 asm volatile ( |
| 1787 "sub %[u_buf],%[v_buf] \n" |
| 1788 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1789 LABELALIGN |
| 1790 "1: \n" |
| 1791 READYUV422 |
| 1792 YUVTORGB(kYuvConstants) |
| 1793 STOREARGB |
| 1794 "sub $0x8,%[width] \n" |
| 1795 "jg 1b \n" |
| 1796 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1797 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1798 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1799 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1800 [width]"+rm"(width) // %[width] |
| 1801 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] |
| 1802 : "memory", "cc", NACL_R14 |
| 1803 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1804 ); |
| 1805 } |
| 1806 |
1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1807 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1717 const uint8* u_buf, | 1808 const uint8* u_buf, |
1718 const uint8* v_buf, | 1809 const uint8* v_buf, |
1719 uint8* dst_argb, | 1810 uint8* dst_argb, |
1720 int width) { | 1811 int width) { |
1721 asm volatile ( | 1812 asm volatile ( |
1722 "sub %[u_buf],%[v_buf] \n" | 1813 "sub %[u_buf],%[v_buf] \n" |
1723 "pcmpeqb %%xmm5,%%xmm5 \n" | 1814 "pcmpeqb %%xmm5,%%xmm5 \n" |
1724 LABELALIGN | 1815 LABELALIGN |
1725 "1: \n" | 1816 "1: \n" |
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1965 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" | 1966 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" |
1876 | 1967 |
1877 // Convert 16 pixels: 16 UV and 16 Y. | 1968 // Convert 16 pixels: 16 UV and 16 Y. |
1878 #define YUVTORGB_AVX2(YuvConstants) \ | 1969 #define YUVTORGB_AVX2(YuvConstants) \ |
1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | 1970 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ |
1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | 1971 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ |
1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | 1972 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ |
1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | 1973 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ |
1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 1974 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ | 1975 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ |
1885 "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ | 1976 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ | 1977 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ |
1887 "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ | 1978 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | 1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1980 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ | 1981 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ |
1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ | 1982 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ |
1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ | 1983 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ |
1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ | 1984 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ |
1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ | 1985 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ |
1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ | 1986 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ |
1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 1987 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 1988 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1977 [v_buf]"+r"(v_buf), // %[v_buf] | 2068 [v_buf]"+r"(v_buf), // %[v_buf] |
1978 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2069 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1979 [width]"+rm"(width) // %[width] | 2070 [width]"+rm"(width) // %[width] |
1980 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
1981 : "memory", "cc", NACL_R14 | 2072 : "memory", "cc", NACL_R14 |
1982 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2073 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1983 ); | 2074 ); |
1984 } | 2075 } |
1985 #endif // HAS_I422TOARGBROW_AVX2 | 2076 #endif // HAS_I422TOARGBROW_AVX2 |
1986 | 2077 |
| 2078 #if defined(HAS_J422TOARGBROW_AVX2) |
| 2079 // 16 pixels |
| 2080 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2081 void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, |
| 2082 const uint8* u_buf, |
| 2083 const uint8* v_buf, |
| 2084 uint8* dst_argb, |
| 2085 int width) { |
| 2086 asm volatile ( |
| 2087 "sub %[u_buf],%[v_buf] \n" |
| 2088 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2089 LABELALIGN |
| 2090 "1: \n" |
| 2091 READYUV422_AVX2 |
| 2092 YUVTORGB_AVX2(kYuvConstants) |
| 2093 |
| 2094 // Step 3: Weave into ARGB |
| 2095 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG |
| 2096 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 2097 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA |
| 2098 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 2099 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels |
| 2100 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels |
| 2101 |
| 2102 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" |
| 2103 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" |
| 2104 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
| 2105 "sub $0x10,%[width] \n" |
| 2106 "jg 1b \n" |
| 2107 "vzeroupper \n" |
| 2108 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2109 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2110 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2111 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2112 [width]"+rm"(width) // %[width] |
| 2113 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] |
| 2114 : "memory", "cc", NACL_R14 |
| 2115 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 2116 ); |
| 2117 } |
| 2118 #endif // HAS_J422TOARGBROW_AVX2 |
| 2119 |
1987 #if defined(HAS_I422TOABGRROW_AVX2) | 2120 #if defined(HAS_I422TOABGRROW_AVX2) |
1988 // 16 pixels | 2121 // 16 pixels |
1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, | 2123 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
1991 const uint8* u_buf, | 2124 const uint8* u_buf, |
1992 const uint8* v_buf, | 2125 const uint8* v_buf, |
1993 uint8* dst_argb, | 2126 uint8* dst_argb, |
1994 int width) { | 2127 int width) { |
1995 asm volatile ( | 2128 asm volatile ( |
1996 "sub %[u_buf],%[v_buf] \n" | 2129 "sub %[u_buf],%[v_buf] \n" |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2059 [v_buf]"+r"(v_buf), // %[v_buf] | 2192 [v_buf]"+r"(v_buf), // %[v_buf] |
2060 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2193 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2061 [width]"+rm"(width) // %[width] | 2194 [width]"+rm"(width) // %[width] |
2062 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 2195 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
2063 : "memory", "cc", NACL_R14 | 2196 : "memory", "cc", NACL_R14 |
2064 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2197 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
2065 ); | 2198 ); |
2066 } | 2199 } |
2067 #endif // HAS_I422TORGBAROW_AVX2 | 2200 #endif // HAS_I422TORGBAROW_AVX2 |
2068 | 2201 |
2069 #ifdef HAS_YTOARGBROW_SSE2 | 2202 #ifdef HAS_I400TOARGBROW_SSE2 |
2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2203 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
2071 asm volatile ( | 2204 asm volatile ( |
2072 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2205 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2073 "movd %%eax,%%xmm2 \n" | 2206 "movd %%eax,%%xmm2 \n" |
2074 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2207 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
2075 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2208 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
2076 "movd %%eax,%%xmm3 \n" | 2209 "movd %%eax,%%xmm3 \n" |
2077 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 2210 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
2078 "pcmpeqb %%xmm4,%%xmm4 \n" | 2211 "pcmpeqb %%xmm4,%%xmm4 \n" |
2079 "pslld $0x18,%%xmm4 \n" | 2212 "pslld $0x18,%%xmm4 \n" |
2080 LABELALIGN | 2213 LABELALIGN |
(...skipping 21 matching lines...) Expand all Loading... |
2102 "sub $0x8,%2 \n" | 2235 "sub $0x8,%2 \n" |
2103 "jg 1b \n" | 2236 "jg 1b \n" |
2104 : "+r"(y_buf), // %0 | 2237 : "+r"(y_buf), // %0 |
2105 "+r"(dst_argb), // %1 | 2238 "+r"(dst_argb), // %1 |
2106 "+rm"(width) // %2 | 2239 "+rm"(width) // %2 |
2107 : | 2240 : |
2108 : "memory", "cc", "eax" | 2241 : "memory", "cc", "eax" |
2109 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 2242 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
2110 ); | 2243 ); |
2111 } | 2244 } |
2112 #endif // HAS_YTOARGBROW_SSE2 | 2245 #endif // HAS_I400TOARGBROW_SSE2 |
2113 | 2246 |
2114 #ifdef HAS_YTOARGBROW_AVX2 | 2247 #ifdef HAS_I400TOARGBROW_AVX2 |
2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | 2248 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
2116 // note: vpunpcklbw mutates and vpackuswb unmutates. | 2249 // note: vpunpcklbw mutates and vpackuswb unmutates. |
2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { | 2250 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { |
2118 asm volatile ( | 2251 asm volatile ( |
2119 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2252 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2120 "vmovd %%eax,%%xmm2 \n" | 2253 "vmovd %%eax,%%xmm2 \n" |
2121 "vbroadcastss %%xmm2,%%ymm2 \n" | 2254 "vbroadcastss %%xmm2,%%ymm2 \n" |
2122 "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2255 "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
2123 "vmovd %%eax,%%xmm3 \n" | 2256 "vmovd %%eax,%%xmm3 \n" |
2124 "vbroadcastss %%xmm3,%%ymm3 \n" | 2257 "vbroadcastss %%xmm3,%%ymm3 \n" |
2125 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | 2258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
2126 "vpslld $0x18,%%ymm4,%%ymm4 \n" | 2259 "vpslld $0x18,%%ymm4,%%ymm4 \n" |
2127 | 2260 |
(...skipping 21 matching lines...) Expand all Loading... |
2149 "jg 1b \n" | 2282 "jg 1b \n" |
2150 "vzeroupper \n" | 2283 "vzeroupper \n" |
2151 : "+r"(y_buf), // %0 | 2284 : "+r"(y_buf), // %0 |
2152 "+r"(dst_argb), // %1 | 2285 "+r"(dst_argb), // %1 |
2153 "+rm"(width) // %2 | 2286 "+rm"(width) // %2 |
2154 : | 2287 : |
2155 : "memory", "cc", "eax" | 2288 : "memory", "cc", "eax" |
2156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 2289 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
2157 ); | 2290 ); |
2158 } | 2291 } |
2159 #endif // HAS_YTOARGBROW_AVX2 | 2292 #endif // HAS_I400TOARGBROW_AVX2 |
2160 | 2293 |
2161 #ifdef HAS_MIRRORROW_SSSE3 | 2294 #ifdef HAS_MIRRORROW_SSSE3 |
2162 // Shuffle table for reversing the bytes. | 2295 // Shuffle table for reversing the bytes. |
2163 static uvec8 kShuffleMirror = { | 2296 static uvec8 kShuffleMirror = { |
2164 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 2297 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
2165 }; | 2298 }; |
2166 | 2299 |
2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 2300 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
2168 intptr_t temp_width = (intptr_t)(width); | 2301 intptr_t temp_width = (intptr_t)(width); |
2169 asm volatile ( | 2302 asm volatile ( |
(...skipping 919 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3089 uint8* dst_argb, int width) { | 3222 uint8* dst_argb, int width) { |
3090 asm volatile ( | 3223 asm volatile ( |
3091 "pcmpeqb %%xmm7,%%xmm7 \n" | 3224 "pcmpeqb %%xmm7,%%xmm7 \n" |
3092 "psrlw $0xf,%%xmm7 \n" | 3225 "psrlw $0xf,%%xmm7 \n" |
3093 "pcmpeqb %%xmm6,%%xmm6 \n" | 3226 "pcmpeqb %%xmm6,%%xmm6 \n" |
3094 "psrlw $0x8,%%xmm6 \n" | 3227 "psrlw $0x8,%%xmm6 \n" |
3095 "pcmpeqb %%xmm5,%%xmm5 \n" | 3228 "pcmpeqb %%xmm5,%%xmm5 \n" |
3096 "psllw $0x8,%%xmm5 \n" | 3229 "psllw $0x8,%%xmm5 \n" |
3097 "pcmpeqb %%xmm4,%%xmm4 \n" | 3230 "pcmpeqb %%xmm4,%%xmm4 \n" |
3098 "pslld $0x18,%%xmm4 \n" | 3231 "pslld $0x18,%%xmm4 \n" |
3099 "sub $0x1,%3 \n" | 3232 "sub $0x4,%3 \n" |
3100 "je 91f \n" | |
3101 "jl 99f \n" | |
3102 | |
3103 // 1 pixel loop until destination pointer is aligned. | |
3104 "10: \n" | |
3105 "test $0xf,%2 \n" | |
3106 "je 19f \n" | |
3107 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3108 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3109 "movdqa %%xmm3,%%xmm0 \n" | |
3110 "pxor %%xmm4,%%xmm3 \n" | |
3111 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3112 "psrlw $0x8,%%xmm3 \n" | |
3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3115 "pand %%xmm6,%%xmm2 \n" | |
3116 "paddw %%xmm7,%%xmm3 \n" | |
3117 "pmullw %%xmm3,%%xmm2 \n" | |
3118 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3119 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3120 "psrlw $0x8,%%xmm1 \n" | |
3121 "por %%xmm4,%%xmm0 \n" | |
3122 "pmullw %%xmm3,%%xmm1 \n" | |
3123 "psrlw $0x8,%%xmm2 \n" | |
3124 "paddusb %%xmm2,%%xmm0 \n" | |
3125 "pand %%xmm5,%%xmm1 \n" | |
3126 "paddusb %%xmm1,%%xmm0 \n" | |
3127 "movd %%xmm0," MEMACCESS(2) " \n" | |
3128 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3129 "sub $0x1,%3 \n" | |
3130 "jge 10b \n" | |
3131 | |
3132 "19: \n" | |
3133 "add $1-4,%3 \n" | |
3134 "jl 49f \n" | 3233 "jl 49f \n" |
3135 | 3234 |
3136 // 4 pixel loop. | 3235 // 4 pixel loop. |
3137 LABELALIGN | 3236 LABELALIGN |
3138 "41: \n" | 3237 "41: \n" |
3139 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | 3238 "movdqu " MEMACCESS(0) ",%%xmm3 \n" |
3140 "lea " MEMLEA(0x10,0) ",%0 \n" | 3239 "lea " MEMLEA(0x10,0) ",%0 \n" |
3141 "movdqa %%xmm3,%%xmm0 \n" | 3240 "movdqa %%xmm3,%%xmm0 \n" |
3142 "pxor %%xmm4,%%xmm3 \n" | 3241 "pxor %%xmm4,%%xmm3 \n" |
3143 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | 3242 "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3224 uint8* dst_argb, int width) { | 3323 uint8* dst_argb, int width) { |
3225 asm volatile ( | 3324 asm volatile ( |
3226 "pcmpeqb %%xmm7,%%xmm7 \n" | 3325 "pcmpeqb %%xmm7,%%xmm7 \n" |
3227 "psrlw $0xf,%%xmm7 \n" | 3326 "psrlw $0xf,%%xmm7 \n" |
3228 "pcmpeqb %%xmm6,%%xmm6 \n" | 3327 "pcmpeqb %%xmm6,%%xmm6 \n" |
3229 "psrlw $0x8,%%xmm6 \n" | 3328 "psrlw $0x8,%%xmm6 \n" |
3230 "pcmpeqb %%xmm5,%%xmm5 \n" | 3329 "pcmpeqb %%xmm5,%%xmm5 \n" |
3231 "psllw $0x8,%%xmm5 \n" | 3330 "psllw $0x8,%%xmm5 \n" |
3232 "pcmpeqb %%xmm4,%%xmm4 \n" | 3331 "pcmpeqb %%xmm4,%%xmm4 \n" |
3233 "pslld $0x18,%%xmm4 \n" | 3332 "pslld $0x18,%%xmm4 \n" |
3234 "sub $0x1,%3 \n" | 3333 "sub $0x4,%3 \n" |
3235 "je 91f \n" | |
3236 "jl 99f \n" | |
3237 | |
3238 // 1 pixel loop until destination pointer is aligned. | |
3239 "10: \n" | |
3240 "test $0xf,%2 \n" | |
3241 "je 19f \n" | |
3242 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3243 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3244 "movdqa %%xmm3,%%xmm0 \n" | |
3245 "pxor %%xmm4,%%xmm3 \n" | |
3246 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3247 "pshufb %4,%%xmm3 \n" | |
3248 "pand %%xmm6,%%xmm2 \n" | |
3249 "paddw %%xmm7,%%xmm3 \n" | |
3250 "pmullw %%xmm3,%%xmm2 \n" | |
3251 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3252 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3253 "psrlw $0x8,%%xmm1 \n" | |
3254 "por %%xmm4,%%xmm0 \n" | |
3255 "pmullw %%xmm3,%%xmm1 \n" | |
3256 "psrlw $0x8,%%xmm2 \n" | |
3257 "paddusb %%xmm2,%%xmm0 \n" | |
3258 "pand %%xmm5,%%xmm1 \n" | |
3259 "paddusb %%xmm1,%%xmm0 \n" | |
3260 "movd %%xmm0," MEMACCESS(2) " \n" | |
3261 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3262 "sub $0x1,%3 \n" | |
3263 "jge 10b \n" | |
3264 | |
3265 "19: \n" | |
3266 "add $1-4,%3 \n" | |
3267 "jl 49f \n" | 3334 "jl 49f \n" |
3268 | 3335 |
3269 // 4 pixel loop. | 3336 // 4 pixel loop. |
3270 LABELALIGN | 3337 LABELALIGN |
3271 "40: \n" | 3338 "40: \n" |
3272 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | 3339 "movdqu " MEMACCESS(0) ",%%xmm3 \n" |
3273 "lea " MEMLEA(0x10,0) ",%0 \n" | 3340 "lea " MEMLEA(0x10,0) ",%0 \n" |
3274 "movdqa %%xmm3,%%xmm0 \n" | 3341 "movdqa %%xmm3,%%xmm0 \n" |
3275 "pxor %%xmm4,%%xmm3 \n" | 3342 "pxor %%xmm4,%%xmm3 \n" |
3276 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | 3343 "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
(...skipping 1613 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4890 "+r"(src_ptr), // %1 | 4957 "+r"(src_ptr), // %1 |
4891 "+r"(dst_width), // %2 | 4958 "+r"(dst_width), // %2 |
4892 "+r"(source_y_fraction) // %3 | 4959 "+r"(source_y_fraction) // %3 |
4893 : "r"((intptr_t)(src_stride)) // %4 | 4960 : "r"((intptr_t)(src_stride)) // %4 |
4894 : "memory", "cc", NACL_R14 | 4961 : "memory", "cc", NACL_R14 |
4895 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 4962 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
4896 ); | 4963 ); |
4897 } | 4964 } |
4898 #endif // HAS_INTERPOLATEROW_SSE2 | 4965 #endif // HAS_INTERPOLATEROW_SSE2 |
4899 | 4966 |
4900 #ifdef HAS_ARGBTOBAYERGGROW_SSE2 | |
4901 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, | |
4902 uint32 selector, int pix) { | |
4903 asm volatile ( | |
4904 "pcmpeqb %%xmm5,%%xmm5 \n" | |
4905 "psrld $0x18,%%xmm5 \n" | |
4906 LABELALIGN | |
4907 "1: \n" | |
4908 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4909 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
4910 "lea " MEMLEA(0x20,0) ",%0 \n" | |
4911 "psrld $0x8,%%xmm0 \n" | |
4912 "psrld $0x8,%%xmm1 \n" | |
4913 "pand %%xmm5,%%xmm0 \n" | |
4914 "pand %%xmm5,%%xmm1 \n" | |
4915 "packssdw %%xmm1,%%xmm0 \n" | |
4916 "packuswb %%xmm1,%%xmm0 \n" | |
4917 "movq %%xmm0," MEMACCESS(1) " \n" | |
4918 "lea " MEMLEA(0x8,1) ",%1 \n" | |
4919 "sub $0x8,%2 \n" | |
4920 "jg 1b \n" | |
4921 : "+r"(src_argb), // %0 | |
4922 "+r"(dst_bayer), // %1 | |
4923 "+r"(pix) // %2 | |
4924 : | |
4925 : "memory", "cc" | |
4926 , "xmm0", "xmm1", "xmm5" | |
4927 ); | |
4928 } | |
4929 #endif // HAS_ARGBTOBAYERGGROW_SSE2 | |
4930 | |
4931 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 | 4967 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
4932 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 4968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
4933 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 4969 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
4934 const uint8* shuffler, int pix) { | 4970 const uint8* shuffler, int pix) { |
4935 asm volatile ( | 4971 asm volatile ( |
4936 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 4972 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
4937 LABELALIGN | 4973 LABELALIGN |
4938 "1: \n" | 4974 "1: \n" |
4939 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4975 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4940 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 4976 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
(...skipping 489 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5430 ); | 5466 ); |
5431 } | 5467 } |
5432 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5468 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5433 | 5469 |
5434 #endif // defined(__x86_64__) || defined(__i386__) | 5470 #endif // defined(__x86_64__) || defined(__i386__) |
5435 | 5471 |
5436 #ifdef __cplusplus | 5472 #ifdef __cplusplus |
5437 } // extern "C" | 5473 } // extern "C" |
5438 } // namespace libyuv | 5474 } // namespace libyuv |
5439 #endif | 5475 #endif |
OLD | NEW |