Chromium Code Reviews

Unified Diff: source/libvpx/third_party/libyuv/source/row_gcc.cc

Issue 1302353004: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
// VERSION 2
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
(...skipping 218 matching lines...)
    "jg 1b \n"
  : "+r"(src_y),    // %0
    "+r"(dst_argb), // %1
    "+r"(pix)       // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif // TESTING

-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),    // %0
    "+r"(dst_argb), // %1
    "+r"(pix)       // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
-#endif // HAS_I400TOARGBROW_SSE2
+#endif // HAS_J400TOARGBROW_SSE2
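For reference, a minimal scalar sketch of what this row does (the function name is illustrative; libyuv's real scalar fallbacks live in row_common.cc): each gray byte is replicated into B, G and R, and the 0xff000000 mask built with pcmpeqb/pslld supplies opaque alpha.

#include <stdint.h>

static void J400ToARGBRow_C_sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                   int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8_t y = src_y[i];
    dst_argb[0] = y;    // B
    dst_argb[1] = y;    // G
    dst_argb[2] = y;    // R
    dst_argb[3] = 255;  // A, from the 0xff000000 mask in xmm5
    dst_argb += 4;
  }
}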

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
(...skipping 670 matching lines...)
    "m"(kARGBToV), // %6
    "m"(kARGBToU), // %7
    "m"(kShufARGBToUV_AVX) // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
(...skipping 440 matching lines...)
    "m"(kRGBAToV), // %5
    "m"(kRGBAToU), // %6
    "m"(kAddUV128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

-// YUV to RGB conversion constants.
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* -min(128, round(2.018 * 64)) */
-#define UG 25 /* -round(-0.391 * 64) */
-#define VG 52 /* -round(-0.813 * 64) */
-#define VR -102 /* -round(1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 - YGB)
-#define BG (UG * 128 + VG * 128 - YGB)
-#define BR (VR * 128 - YGB)
-
struct YuvConstants {
  lvec8 kUVToB;    // 0
  lvec8 kUVToG;    // 32
  lvec8 kUVToR;    // 64
  lvec16 kUVBiasB; // 96
  lvec16 kUVBiasG; // 128
  lvec16 kUVBiasR; // 160
  lvec16 kYToRgb;  // 192
};

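The byte offsets in the member comments are exactly what the AVX2 conversion macro further down reads via MEMACCESS2(32, ...) through MEMACCESS2(192, ...). A hedged compile-time check, assuming C11 and that lvec8/lvec16 are libyuv's 32-byte vector typedefs:

#include <stddef.h>  /* offsetof */

_Static_assert(offsetof(struct YuvConstants, kUVToG) == 32, "kUVToG at 32");
_Static_assert(offsetof(struct YuvConstants, kUVBiasB) == 96, "kUVBiasB at 96");
_Static_assert(offsetof(struct YuvConstants, kYToRgb) == 192, "kYToRgb at 192");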
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
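As a sanity check on the fixed-point math, here is a hedged scalar sketch of one BT.601 pixel using the constants above (names are illustrative; the production scalar path is in row_common.cc). y1 approximates y * 1.164 * 64, the chroma products are at the same 6-bit scale, and BB/BG/BR fold in the -16/-128 offsets plus rounding:

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (int)(((uint32_t)y * 0x0101 * YG) >> 16);  // ~ y * 74.5
  *b = Clamp255((y1 - u * UB + BB) >> 6);             // UB < 0, so -u*UB adds
  *g = Clamp255((y1 - (u * UG + v * VG) + BG) >> 6);
  *r = Clamp255((y1 - v * VR + BR) >> 6);
}

For y = 235, u = v = 128 this gives y1 = 17507 and (17507 - 1160) >> 6 = 255 on all three channels, as expected for white.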
// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
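A quick numeric check of the full-range constants (a worked example, not from the file): for white, y = 255 and u = v = 128, y1 = (255 * 0x0101 * YGJ) >> 16 = 16319; the blue channel becomes (16319 + 113 * 128 + BBJ) >> 6 with BBJ = -113 * 128 + 32 = -14432, i.e. 16351 >> 6 = 255, while black (y = 0, u = v = 128) gives 32 >> 6 = 0, confirming that JPEG Y carries no 16 offset and only U and V are re-centered.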
// Read 8 UV from 444
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
(...skipping 46 matching lines...)
1527 "packuswb %%xmm2,%%xmm2 \n" 1592 "packuswb %%xmm2,%%xmm2 \n"
1528 1593
1529 // Store 8 ARGB values. Assumes XMM5 is zero. 1594 // Store 8 ARGB values. Assumes XMM5 is zero.
1530 #define STOREARGB \ 1595 #define STOREARGB \
1531 "punpcklbw %%xmm1,%%xmm0 \n" \ 1596 "punpcklbw %%xmm1,%%xmm0 \n" \
1532 "punpcklbw %%xmm5,%%xmm2 \n" \ 1597 "punpcklbw %%xmm5,%%xmm2 \n" \
1533 "movdqa %%xmm0,%%xmm1 \n" \ 1598 "movdqa %%xmm0,%%xmm1 \n" \
1534 "punpcklwd %%xmm2,%%xmm0 \n" \ 1599 "punpcklwd %%xmm2,%%xmm0 \n" \
1535 "punpckhwd %%xmm2,%%xmm1 \n" \ 1600 "punpckhwd %%xmm2,%%xmm1 \n" \
1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ 1601 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ 1602 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1603 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
1539 1604
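The two unpack steps interleave the B/G vector with the R/A vector so memory reads B,G,R,A per pixel. Note that in the callers shown below, xmm5 is set to all-0xff by pcmpeqb before the loop, which is what supplies the opaque alpha; the "Assumes XMM5 is zero" comments look like copy-paste leftovers. A scalar picture of STOREARGB, with b/g/r/a standing for the eight lanes of xmm0/xmm1/xmm2/xmm5 (illustrative only):

for (int i = 0; i < 8; ++i) {
  dst_argb[4 * i + 0] = b[i];
  dst_argb[4 * i + 1] = g[i];
  dst_argb[4 * i + 2] = r[i];
  dst_argb[4 * i + 3] = a[i];  /* 0xff lanes */
}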
// Store 8 BGRA values. Assumes XMM5 is zero.
#define STOREBGRA \
    "pcmpeqb %%xmm5,%%xmm5 \n" \
    "punpcklbw %%xmm0,%%xmm1 \n" \
    "punpcklbw %%xmm2,%%xmm5 \n" \
    "movdqa %%xmm5,%%xmm0 \n" \
    "punpcklwd %%xmm1,%%xmm5 \n" \
    "punpckhwd %%xmm1,%%xmm0 \n" \
    "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
-   "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
-   "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+   "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
+   "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"

// Store 8 ABGR values. Assumes XMM5 is zero.
#define STOREABGR \
    "punpcklbw %%xmm1,%%xmm2 \n" \
    "punpcklbw %%xmm5,%%xmm0 \n" \
    "movdqa %%xmm2,%%xmm1 \n" \
    "punpcklwd %%xmm0,%%xmm2 \n" \
    "punpckhwd %%xmm0,%%xmm1 \n" \
    "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
-   "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
-   "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+   "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
+   "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"

// Store 8 RGBA values. Assumes XMM5 is zero.
#define STORERGBA \
    "pcmpeqb %%xmm5,%%xmm5 \n" \
    "punpcklbw %%xmm2,%%xmm1 \n" \
    "punpcklbw %%xmm0,%%xmm5 \n" \
    "movdqa %%xmm5,%%xmm0 \n" \
    "punpcklwd %%xmm1,%%xmm5 \n" \
    "punpckhwd %%xmm1,%%xmm0 \n" \
    "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
-   "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
-   "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
+   "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
+   "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
(...skipping 122 matching lines...)
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}

+void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "sub %[u_buf],%[v_buf] \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    LABELALIGN
+  "1: \n"
+    READYUV422
+    YUVTORGB(kYuvConstants)
+    STOREARGB
+    "sub $0x8,%[width] \n"
+    "jg 1b \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [u_buf]"+r"(u_buf), // %[u_buf]
+    [v_buf]"+r"(v_buf), // %[v_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
(...skipping 148 matching lines...)
1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1965 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" 1966 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1876 1967
1877 // Convert 16 pixels: 16 UV and 16 Y. 1968 // Convert 16 pixels: 16 UV and 16 Y.
1878 #define YUVTORGB_AVX2(YuvConstants) \ 1969 #define YUVTORGB_AVX2(YuvConstants) \
1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ 1970 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ 1971 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ 1972 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ 1973 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ 1974 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ 1975 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
1885 "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ 1976 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ 1977 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
1887 "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ 1978 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ 1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ 1980 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ 1981 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ 1982 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ 1983 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ 1984 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ 1985 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ 1986 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 1987 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 1988 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
1977 [v_buf]"+r"(v_buf), // %[v_buf] 2068 [v_buf]"+r"(v_buf), // %[v_buf]
1978 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2069 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1979 [width]"+rm"(width) // %[width] 2070 [width]"+rm"(width) // %[width]
1980 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1981 : "memory", "cc", NACL_R14 2072 : "memory", "cc", NACL_R14
1982 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2073 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1983 ); 2074 );
1984 } 2075 }
1985 #endif // HAS_I422TOARGBROW_AVX2 2076 #endif // HAS_I422TOARGBROW_AVX2
1986 2077
+#if defined(HAS_J422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               int width) {
+  asm volatile (
+    "sub %[u_buf],%[v_buf] \n"
+    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+    LABELALIGN
+  "1: \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+    "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+    "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub $0x10,%[width] \n"
+    "jg 1b \n"
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [u_buf]"+r"(u_buf), // %[u_buf]
+    [v_buf]"+r"(v_buf), // %[v_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif // HAS_J422TOARGBROW_AVX2
+
#if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
(...skipping 62 matching lines...)
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_I422TORGBAROW_AVX2

-#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    LABELALIGN
(...skipping 21 matching lines...)
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(dst_argb), // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
-#endif // HAS_YTOARGBROW_SSE2
+#endif // HAS_I400TOARGBROW_SSE2

-#ifdef HAS_YTOARGBROW_AVX2
+#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
    "vmovd %%eax,%%xmm2 \n"
    "vbroadcastss %%xmm2,%%ymm2 \n"
    "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
    "vmovd %%eax,%%xmm3 \n"
    "vbroadcastss %%xmm3,%%ymm3 \n"
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpslld $0x18,%%ymm4,%%ymm4 \n"

(...skipping 21 matching lines...)
2149 "jg 1b \n" 2282 "jg 1b \n"
2150 "vzeroupper \n" 2283 "vzeroupper \n"
2151 : "+r"(y_buf), // %0 2284 : "+r"(y_buf), // %0
2152 "+r"(dst_argb), // %1 2285 "+r"(dst_argb), // %1
2153 "+rm"(width) // %2 2286 "+rm"(width) // %2
2154 : 2287 :
2155 : "memory", "cc", "eax" 2288 : "memory", "cc", "eax"
2156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 2289 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2157 ); 2290 );
2158 } 2291 }
2159 #endif // HAS_YTOARGBROW_AVX2 2292 #endif // HAS_I400TOARGBROW_AVX2
2160 2293
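A hedged scalar equivalent of the Y expansion above (illustrative name; it assumes the elided loop body applies pmulhuw by the 1.164 factor, an unsigned-saturating subtract of the 1160 bias, and a 6-bit shift, matching the constants loaded into xmm2/xmm3):

#include <stdint.h>

static uint8_t I400ToGraySketch(uint8_t y) {
  unsigned v = ((unsigned)y * 0x0101 * 0x4a35) >> 16;  /* ~ y * 1.164 * 64 */
  v = (v > 1160) ? v - 1160 : 0;                       /* psubusw */
  return (uint8_t)(v >> 6);                            /* back to 8 bits */
}

Each resulting gray value is then written to B, G and R with alpha forced to 0xff by the 0xff000000 mask in xmm4/ymm4.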
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

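pshufb with this table reverses each 16-byte block: every output byte selects the input byte named by the table entry, so indices 15..0 mirror the row. In scalar form (illustrative):

for (int i = 0; i < 16; ++i) {
  dst[i] = src[kShuffleMirror[i]];  /* == src[15 - i] */
}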
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
(...skipping 919 matching lines...)
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
3099 "sub $0x1,%3 \n" 3232 "sub $0x4,%3 \n"
3100 "je 91f \n"
3101 "jl 99f \n"
3102
3103 // 1 pixel loop until destination pointer is aligned.
3104 "10: \n"
3105 "test $0xf,%2 \n"
3106 "je 19f \n"
3107 "movd " MEMACCESS(0) ",%%xmm3 \n"
3108 "lea " MEMLEA(0x4,0) ",%0 \n"
3109 "movdqa %%xmm3,%%xmm0 \n"
3110 "pxor %%xmm4,%%xmm3 \n"
3111 "movd " MEMACCESS(1) ",%%xmm2 \n"
3112 "psrlw $0x8,%%xmm3 \n"
3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3115 "pand %%xmm6,%%xmm2 \n"
3116 "paddw %%xmm7,%%xmm3 \n"
3117 "pmullw %%xmm3,%%xmm2 \n"
3118 "movd " MEMACCESS(1) ",%%xmm1 \n"
3119 "lea " MEMLEA(0x4,1) ",%1 \n"
3120 "psrlw $0x8,%%xmm1 \n"
3121 "por %%xmm4,%%xmm0 \n"
3122 "pmullw %%xmm3,%%xmm1 \n"
3123 "psrlw $0x8,%%xmm2 \n"
3124 "paddusb %%xmm2,%%xmm0 \n"
3125 "pand %%xmm5,%%xmm1 \n"
3126 "paddusb %%xmm1,%%xmm0 \n"
3127 "movd %%xmm0," MEMACCESS(2) " \n"
3128 "lea " MEMLEA(0x4,2) ",%2 \n"
3129 "sub $0x1,%3 \n"
3130 "jge 10b \n"
3131
3132 "19: \n"
3133 "add $1-4,%3 \n"
3134 "jl 49f \n" 3233 "jl 49f \n"

    // 4 pixel loop.
    LABELALIGN
  "41: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
(...skipping 80 matching lines...)
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
3234 "sub $0x1,%3 \n" 3333 "sub $0x4,%3 \n"
3235 "je 91f \n"
3236 "jl 99f \n"
3237
3238 // 1 pixel loop until destination pointer is aligned.
3239 "10: \n"
3240 "test $0xf,%2 \n"
3241 "je 19f \n"
3242 "movd " MEMACCESS(0) ",%%xmm3 \n"
3243 "lea " MEMLEA(0x4,0) ",%0 \n"
3244 "movdqa %%xmm3,%%xmm0 \n"
3245 "pxor %%xmm4,%%xmm3 \n"
3246 "movd " MEMACCESS(1) ",%%xmm2 \n"
3247 "pshufb %4,%%xmm3 \n"
3248 "pand %%xmm6,%%xmm2 \n"
3249 "paddw %%xmm7,%%xmm3 \n"
3250 "pmullw %%xmm3,%%xmm2 \n"
3251 "movd " MEMACCESS(1) ",%%xmm1 \n"
3252 "lea " MEMLEA(0x4,1) ",%1 \n"
3253 "psrlw $0x8,%%xmm1 \n"
3254 "por %%xmm4,%%xmm0 \n"
3255 "pmullw %%xmm3,%%xmm1 \n"
3256 "psrlw $0x8,%%xmm2 \n"
3257 "paddusb %%xmm2,%%xmm0 \n"
3258 "pand %%xmm5,%%xmm1 \n"
3259 "paddusb %%xmm1,%%xmm0 \n"
3260 "movd %%xmm0," MEMACCESS(2) " \n"
3261 "lea " MEMLEA(0x4,2) ",%2 \n"
3262 "sub $0x1,%3 \n"
3263 "jge 10b \n"
3264
3265 "19: \n"
3266 "add $1-4,%3 \n"
3267 "jl 49f \n" 3334 "jl 49f \n"

    // 4 pixel loop.
    LABELALIGN
  "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
(...skipping 1613 matching lines...)
4890 "+r"(src_ptr), // %1 4957 "+r"(src_ptr), // %1
4891 "+r"(dst_width), // %2 4958 "+r"(dst_width), // %2
4892 "+r"(source_y_fraction) // %3 4959 "+r"(source_y_fraction) // %3
4893 : "r"((intptr_t)(src_stride)) // %4 4960 : "r"((intptr_t)(src_stride)) // %4
4894 : "memory", "cc", NACL_R14 4961 : "memory", "cc", NACL_R14
4895 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4962 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4896 ); 4963 );
4897 } 4964 }
4898 #endif // HAS_INTERPOLATEROW_SSE2 4965 #endif // HAS_INTERPOLATEROW_SSE2
4899 4966
-#ifdef HAS_ARGBTOBAYERGGROW_SSE2
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 selector, int pix) {
-  asm volatile (
-    "pcmpeqb %%xmm5,%%xmm5 \n"
-    "psrld $0x18,%%xmm5 \n"
-    LABELALIGN
-  "1: \n"
-    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
-    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
-    "lea " MEMLEA(0x20,0) ",%0 \n"
-    "psrld $0x8,%%xmm0 \n"
-    "psrld $0x8,%%xmm1 \n"
-    "pand %%xmm5,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "packssdw %%xmm1,%%xmm0 \n"
-    "packuswb %%xmm1,%%xmm0 \n"
-    "movq %%xmm0," MEMACCESS(1) " \n"
-    "lea " MEMLEA(0x8,1) ",%1 \n"
-    "sub $0x8,%2 \n"
-    "jg 1b \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_bayer), // %1
-    "+r"(pix)        // %2
-  :
-  : "memory", "cc"
-  , "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif // HAS_ARGBTOBAYERGGROW_SSE2
-
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
(...skipping 489 matching lines...)
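The shuffler argument is a 16-byte pshufb mask covering four ARGB pixels at a time, which is how one routine serves the BGRA/ABGR/RGBA/ARGB reorders named in the comment. A hedged scalar picture (ignoring pshufb's zeroing of high-bit indices, which these channel-reorder masks do not use):

for (int i = 0; i < pix * 4; i += 16) {
  for (int j = 0; j < 16; ++j) {
    dst_argb[i + j] = src_argb[i + shuffler[j]];
  }
}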
  );
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif