OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
229 "jg 1b \n" | 229 "jg 1b \n" |
230 : "+r"(src_y), // %0 | 230 : "+r"(src_y), // %0 |
231 "+r"(dst_argb), // %1 | 231 "+r"(dst_argb), // %1 |
232 "+r"(pix) // %2 | 232 "+r"(pix) // %2 |
233 : | 233 : |
234 : "memory", "cc", "xmm0", "xmm1", "xmm5" | 234 : "memory", "cc", "xmm0", "xmm1", "xmm5" |
235 ); | 235 ); |
236 } | 236 } |
237 #endif // TESTING | 237 #endif // TESTING |
238 | 238 |
239 #ifdef HAS_I400TOARGBROW_SSE2 | 239 #ifdef HAS_J400TOARGBROW_SSE2 |
240 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 240 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
241 asm volatile ( | 241 asm volatile ( |
242 "pcmpeqb %%xmm5,%%xmm5 \n" | 242 "pcmpeqb %%xmm5,%%xmm5 \n" |
243 "pslld $0x18,%%xmm5 \n" | 243 "pslld $0x18,%%xmm5 \n" |
244 LABELALIGN | 244 LABELALIGN |
245 "1: \n" | 245 "1: \n" |
246 "movq " MEMACCESS(0) ",%%xmm0 \n" | 246 "movq " MEMACCESS(0) ",%%xmm0 \n" |
247 "lea " MEMLEA(0x8,0) ",%0 \n" | 247 "lea " MEMLEA(0x8,0) ",%0 \n" |
248 "punpcklbw %%xmm0,%%xmm0 \n" | 248 "punpcklbw %%xmm0,%%xmm0 \n" |
249 "movdqa %%xmm0,%%xmm1 \n" | 249 "movdqa %%xmm0,%%xmm1 \n" |
250 "punpcklwd %%xmm0,%%xmm0 \n" | 250 "punpcklwd %%xmm0,%%xmm0 \n" |
251 "punpckhwd %%xmm1,%%xmm1 \n" | 251 "punpckhwd %%xmm1,%%xmm1 \n" |
252 "por %%xmm5,%%xmm0 \n" | 252 "por %%xmm5,%%xmm0 \n" |
253 "por %%xmm5,%%xmm1 \n" | 253 "por %%xmm5,%%xmm1 \n" |
254 "movdqu %%xmm0," MEMACCESS(1) " \n" | 254 "movdqu %%xmm0," MEMACCESS(1) " \n" |
255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 255 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
256 "lea " MEMLEA(0x20,1) ",%1 \n" | 256 "lea " MEMLEA(0x20,1) ",%1 \n" |
257 "sub $0x8,%2 \n" | 257 "sub $0x8,%2 \n" |
258 "jg 1b \n" | 258 "jg 1b \n" |
259 : "+r"(src_y), // %0 | 259 : "+r"(src_y), // %0 |
260 "+r"(dst_argb), // %1 | 260 "+r"(dst_argb), // %1 |
261 "+r"(pix) // %2 | 261 "+r"(pix) // %2 |
262 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 262 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
263 ); | 263 ); |
264 } | 264 } |
265 #endif // HAS_I400TOARGBROW_SSE2 | 265 #endif // HAS_J400TOARGBROW_SSE2 |
266 | 266 |
267 #ifdef HAS_RGB24TOARGBROW_SSSE3 | 267 #ifdef HAS_RGB24TOARGBROW_SSSE3 |
268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { | 268 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
269 asm volatile ( | 269 asm volatile ( |
270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 | 270 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 |
271 "pslld $0x18,%%xmm5 \n" | 271 "pslld $0x18,%%xmm5 \n" |
272 "movdqa %3,%%xmm4 \n" | 272 "movdqa %3,%%xmm4 \n" |
273 LABELALIGN | 273 LABELALIGN |
274 "1: \n" | 274 "1: \n" |
275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 670 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
946 "m"(kARGBToV), // %6 | 946 "m"(kARGBToV), // %6 |
947 "m"(kARGBToU), // %7 | 947 "m"(kARGBToU), // %7 |
948 "m"(kShufARGBToUV_AVX) // %8 | 948 "m"(kShufARGBToUV_AVX) // %8 |
949 : "memory", "cc", NACL_R14 | 949 : "memory", "cc", NACL_R14 |
950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 950 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
951 ); | 951 ); |
952 } | 952 } |
953 #endif // HAS_ARGBTOUVROW_AVX2 | 953 #endif // HAS_ARGBTOUVROW_AVX2 |
954 | 954 |
955 #ifdef HAS_ARGBTOUVJROW_SSSE3 | 955 #ifdef HAS_ARGBTOUVJROW_SSSE3 |
956 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. | |
957 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 956 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
958 uint8* dst_u, uint8* dst_v, int width) { | 957 uint8* dst_u, uint8* dst_v, int width) { |
959 asm volatile ( | 958 asm volatile ( |
960 "movdqa %5,%%xmm3 \n" | 959 "movdqa %5,%%xmm3 \n" |
961 "movdqa %6,%%xmm4 \n" | 960 "movdqa %6,%%xmm4 \n" |
962 "movdqa %7,%%xmm5 \n" | 961 "movdqa %7,%%xmm5 \n" |
963 "sub %1,%2 \n" | 962 "sub %1,%2 \n" |
964 LABELALIGN | 963 LABELALIGN |
965 "1: \n" | 964 "1: \n" |
966 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 965 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 440 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1407 "m"(kRGBAToV), // %5 | 1406 "m"(kRGBAToV), // %5 |
1408 "m"(kRGBAToU), // %6 | 1407 "m"(kRGBAToU), // %6 |
1409 "m"(kAddUV128) // %7 | 1408 "m"(kAddUV128) // %7 |
1410 : "memory", "cc", NACL_R14 | 1409 : "memory", "cc", NACL_R14 |
1411 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1410 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1412 ); | 1411 ); |
1413 } | 1412 } |
1414 | 1413 |
1415 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) | 1414 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
1416 | 1415 |
1417 // YUV to RGB conversion constants. | |
1418 // Y contribution to R,G,B. Scale and bias. | |
1419 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ | |
1420 #define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ | |
1421 | |
1422 // U and V contributions to R,G,B. | |
1423 #define UB -128 /* -min(128, round(2.018 * 64)) */ | |
1424 #define UG 25 /* -round(-0.391 * 64) */ | |
1425 #define VG 52 /* -round(-0.813 * 64) */ | |
1426 #define VR -102 /* -round(1.596 * 64) */ | |
1427 | |
1428 // Bias values to subtract 16 from Y and 128 from U and V. | |
1429 #define BB (UB * 128 - YGB) | |
1430 #define BG (UG * 128 + VG * 128 - YGB) | |
1431 #define BR (VR * 128 - YGB) | |
1432 | |
1433 struct YuvConstants { | 1416 struct YuvConstants { |
1434 lvec8 kUVToB; // 0 | 1417 lvec8 kUVToB; // 0 |
1435 lvec8 kUVToG; // 32 | 1418 lvec8 kUVToG; // 32 |
1436 lvec8 kUVToR; // 64 | 1419 lvec8 kUVToR; // 64 |
1437 lvec16 kUVBiasB; // 96 | 1420 lvec16 kUVBiasB; // 96 |
1438 lvec16 kUVBiasG; // 128 | 1421 lvec16 kUVBiasG; // 128 |
1439 lvec16 kUVBiasR; // 160 | 1422 lvec16 kUVBiasR; // 160 |
1440 lvec16 kYToRgb; // 192 | 1423 lvec16 kYToRgb; // 192 |
1441 }; | 1424 }; |
1442 | 1425 |
| 1426 // BT.601 YUV to RGB reference |
| 1427 // R = (Y - 16) * 1.164 - V * -1.596 |
| 1428 // G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 |
| 1429 // B = (Y - 16) * 1.164 - U * -2.018 |
| 1430 |
| 1431 // Y contribution to R,G,B. Scale and bias. |
| 1432 // TODO(fbarchard): Consider moving constants into a common header. |
| 1433 #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ |
| 1434 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ |
| 1435 |
| 1436 // U and V contributions to R,G,B. |
| 1437 #define UB -128 /* max(-128, round(-2.018 * 64)) */ |
| 1438 #define UG 25 /* round(0.391 * 64) */ |
| 1439 #define VG 52 /* round(0.813 * 64) */ |
| 1440 #define VR -102 /* round(-1.596 * 64) */ |
| 1441 |
| 1442 // Bias values to subtract 16 from Y and 128 from U and V. |
| 1443 #define BB (UB * 128 + YGB) |
| 1444 #define BG (UG * 128 + VG * 128 + YGB) |
| 1445 #define BR (VR * 128 + YGB) |
| 1446 |
1443 // BT601 constants for YUV to RGB. | 1447 // BT601 constants for YUV to RGB. |
1444 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { | 1448 static YuvConstants SIMD_ALIGNED(kYuvConstants) = { |
1445 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, | 1449 { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, |
1446 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, | 1450 UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, |
1447 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, | 1451 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
1448 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, | 1452 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
1449 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, | 1453 { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, |
1450 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, | 1454 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, |
1451 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 1455 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
1452 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 1456 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
1453 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 1457 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
1454 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 1458 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
1455 }; | 1459 }; |
1456 | 1460 |
1457 // BT601 constants for NV21 where chroma plane is VU instead of UV. | 1461 // BT601 constants for NV21 where chroma plane is VU instead of UV. |
1458 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { | 1462 static YuvConstants SIMD_ALIGNED(kYvuConstants) = { |
1459 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, | 1463 { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, |
1460 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, | 1464 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, |
1461 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, | 1465 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
1462 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, | 1466 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
1463 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, | 1467 { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, |
1464 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, | 1468 VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, |
1465 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, | 1469 { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, |
1466 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, | 1470 { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, |
1467 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, | 1471 { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, |
1468 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } | 1472 { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } |
1469 }; | 1473 }; |
1470 | 1474 |
| 1475 #undef YG |
| 1476 #undef YGB |
| 1477 #undef UB |
| 1478 #undef UG |
| 1479 #undef VG |
| 1480 #undef VR |
| 1481 #undef BB |
| 1482 #undef BG |
| 1483 #undef BR |
| 1484 |
| 1485 // JPEG YUV to RGB reference |
| 1486 // * R = Y - V * -1.40200 |
| 1487 // * G = Y - U * 0.34414 - V * 0.71414 |
| 1488 // * B = Y - U * -1.77200 |
| 1489 |
| 1490 // Y contribution to R,G,B. Scale and bias. |
| 1491 // TODO(fbarchard): Consider moving constants into a common header. |
| 1492 #define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ |
| 1493 #define YGBJ 32 /* 64 / 2 */ |
| 1494 |
| 1495 // U and V contributions to R,G,B. |
| 1496 #define UBJ -113 /* round(-1.77200 * 64) */ |
| 1497 #define UGJ 22 /* round(0.34414 * 64) */ |
| 1498 #define VGJ 46 /* round(0.71414 * 64) */ |
| 1499 #define VRJ -90 /* round(-1.40200 * 64) */ |
| 1500 |
| 1501 // Bias values to subtract 16 from Y and 128 from U and V. |
| 1502 #define BBJ (UBJ * 128 + YGBJ) |
| 1503 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) |
| 1504 #define BRJ (VRJ * 128 + YGBJ) |
| 1505 |
| 1506 // JPEG constants for YUV to RGB. |
| 1507 YuvConstants SIMD_ALIGNED(kYuvJConstants) = { |
| 1508 { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, |
| 1509 UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, |
| 1510 { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 1511 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 1512 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, |
| 1513 UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, |
| 1514 { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, |
| 1515 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, |
| 1516 { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, |
| 1517 BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, |
| 1518 { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, |
| 1519 BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, |
| 1520 { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, |
| 1521 BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, |
| 1522 { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, |
| 1523 YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } |
| 1524 }; |
| 1525 |
| 1526 #undef YGJ |
| 1527 #undef YGBJ |
| 1528 #undef UBJ |
| 1529 #undef UGJ |
| 1530 #undef VGJ |
| 1531 #undef VRJ |
| 1532 #undef BBJ |
| 1533 #undef BGJ |
| 1534 #undef BRJ |
| 1535 |
1471 // Read 8 UV from 444 | 1536 // Read 8 UV from 444 |
1472 #define READYUV444 \ | 1537 #define READYUV444 \ |
1473 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1538 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1474 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1539 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1475 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1540 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1476 "punpcklbw %%xmm1,%%xmm0 \n" | 1541 "punpcklbw %%xmm1,%%xmm0 \n" |
1477 | 1542 |
1478 // Read 4 UV from 422, upsample to 8 UV | 1543 // Read 4 UV from 422, upsample to 8 UV |
1479 #define READYUV422 \ | 1544 #define READYUV422 \ |
1480 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1545 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1527 "packuswb %%xmm2,%%xmm2 \n" | 1592 "packuswb %%xmm2,%%xmm2 \n" |
1528 | 1593 |
1529 // Store 8 ARGB values. Assumes XMM5 is zero. | 1594 // Store 8 ARGB values. Assumes XMM5 is zero. |
1530 #define STOREARGB \ | 1595 #define STOREARGB \ |
1531 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1596 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1532 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1597 "punpcklbw %%xmm5,%%xmm2 \n" \ |
1533 "movdqa %%xmm0,%%xmm1 \n" \ | 1598 "movdqa %%xmm0,%%xmm1 \n" \ |
1534 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1599 "punpcklwd %%xmm2,%%xmm0 \n" \ |
1535 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1600 "punpckhwd %%xmm2,%%xmm1 \n" \ |
1536 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1601 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
1537 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ | 1602 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
1538 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" | 1603 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
1539 | 1604 |
1540 // Store 8 BGRA values. Assumes XMM5 is zero. | 1605 // Store 8 BGRA values. Assumes XMM5 is zero. |
1541 #define STOREBGRA \ | 1606 #define STOREBGRA \ |
1542 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1607 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1543 "punpcklbw %%xmm0,%%xmm1 \n" \ | 1608 "punpcklbw %%xmm0,%%xmm1 \n" \ |
1544 "punpcklbw %%xmm2,%%xmm5 \n" \ | 1609 "punpcklbw %%xmm2,%%xmm5 \n" \ |
1545 "movdqa %%xmm5,%%xmm0 \n" \ | 1610 "movdqa %%xmm5,%%xmm0 \n" \ |
1546 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1611 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1547 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1612 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1548 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | 1613 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ |
1549 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ | 1614 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
1550 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" | 1615 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
1551 | 1616 |
1552 // Store 8 ABGR values. Assumes XMM5 is zero. | 1617 // Store 8 ABGR values. Assumes XMM5 is zero. |
1553 #define STOREABGR \ | 1618 #define STOREABGR \ |
1554 "punpcklbw %%xmm1,%%xmm2 \n" \ | 1619 "punpcklbw %%xmm1,%%xmm2 \n" \ |
1555 "punpcklbw %%xmm5,%%xmm0 \n" \ | 1620 "punpcklbw %%xmm5,%%xmm0 \n" \ |
1556 "movdqa %%xmm2,%%xmm1 \n" \ | 1621 "movdqa %%xmm2,%%xmm1 \n" \ |
1557 "punpcklwd %%xmm0,%%xmm2 \n" \ | 1622 "punpcklwd %%xmm0,%%xmm2 \n" \ |
1558 "punpckhwd %%xmm0,%%xmm1 \n" \ | 1623 "punpckhwd %%xmm0,%%xmm1 \n" \ |
1559 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | 1624 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ |
1560 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ | 1625 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
1561 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" | 1626 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
1562 | 1627 |
1563 // Store 8 RGBA values. Assumes XMM5 is zero. | 1628 // Store 8 RGBA values. Assumes XMM5 is zero. |
1564 #define STORERGBA \ | 1629 #define STORERGBA \ |
1565 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1630 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1566 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1631 "punpcklbw %%xmm2,%%xmm1 \n" \ |
1567 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1632 "punpcklbw %%xmm0,%%xmm5 \n" \ |
1568 "movdqa %%xmm5,%%xmm0 \n" \ | 1633 "movdqa %%xmm5,%%xmm0 \n" \ |
1569 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1634 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1570 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1635 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1571 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1636 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
1572 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ | 1637 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1573 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" | 1638 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
1574 | 1639 |
1575 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | 1640 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
1576 const uint8* u_buf, | 1641 const uint8* u_buf, |
1577 const uint8* v_buf, | 1642 const uint8* v_buf, |
1578 uint8* dst_argb, | 1643 uint8* dst_argb, |
1579 int width) { | 1644 int width) { |
1580 asm volatile ( | 1645 asm volatile ( |
1581 "sub %[u_buf],%[v_buf] \n" | 1646 "sub %[u_buf],%[v_buf] \n" |
1582 "pcmpeqb %%xmm5,%%xmm5 \n" | 1647 "pcmpeqb %%xmm5,%%xmm5 \n" |
1583 LABELALIGN | 1648 LABELALIGN |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1706 [u_buf]"+r"(u_buf), // %[u_buf] | 1771 [u_buf]"+r"(u_buf), // %[u_buf] |
1707 [v_buf]"+r"(v_buf), // %[v_buf] | 1772 [v_buf]"+r"(v_buf), // %[v_buf] |
1708 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1773 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1709 [width]"+rm"(width) // %[width] | 1774 [width]"+rm"(width) // %[width] |
1710 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
1711 : "memory", "cc", NACL_R14 | 1776 : "memory", "cc", NACL_R14 |
1712 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 1777 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1713 ); | 1778 ); |
1714 } | 1779 } |
1715 | 1780 |
| 1781 void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, |
| 1782 const uint8* u_buf, |
| 1783 const uint8* v_buf, |
| 1784 uint8* dst_argb, |
| 1785 int width) { |
| 1786 asm volatile ( |
| 1787 "sub %[u_buf],%[v_buf] \n" |
| 1788 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1789 LABELALIGN |
| 1790 "1: \n" |
| 1791 READYUV422 |
| 1792 YUVTORGB(kYuvConstants) |
| 1793 STOREARGB |
| 1794 "sub $0x8,%[width] \n" |
| 1795 "jg 1b \n" |
| 1796 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1797 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1798 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1799 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1800 [width]"+rm"(width) // %[width] |
| 1801 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] |
| 1802 : "memory", "cc", NACL_R14 |
| 1803 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 1804 ); |
| 1805 } |
| 1806 |
1716 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1807 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1717 const uint8* u_buf, | 1808 const uint8* u_buf, |
1718 const uint8* v_buf, | 1809 const uint8* v_buf, |
1719 uint8* dst_argb, | 1810 uint8* dst_argb, |
1720 int width) { | 1811 int width) { |
1721 asm volatile ( | 1812 asm volatile ( |
1722 "sub %[u_buf],%[v_buf] \n" | 1813 "sub %[u_buf],%[v_buf] \n" |
1723 "pcmpeqb %%xmm5,%%xmm5 \n" | 1814 "pcmpeqb %%xmm5,%%xmm5 \n" |
1724 LABELALIGN | 1815 LABELALIGN |
1725 "1: \n" | 1816 "1: \n" |
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1874 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1965 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1875 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" | 1966 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" |
1876 | 1967 |
1877 // Convert 16 pixels: 16 UV and 16 Y. | 1968 // Convert 16 pixels: 16 UV and 16 Y. |
1878 #define YUVTORGB_AVX2(YuvConstants) \ | 1969 #define YUVTORGB_AVX2(YuvConstants) \ |
1879 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | 1970 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ |
1880 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | 1971 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ |
1881 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | 1972 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ |
1882 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | 1973 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ |
1883 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 1974 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
1884 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ | 1975 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ |
1885 "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ | 1976 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
1886 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ | 1977 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ |
1887 "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ | 1978 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
1888 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ | 1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
1889 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1980 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
1890 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ | 1981 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ |
1891 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ | 1982 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ |
1892 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ | 1983 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ |
1893 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ | 1984 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ |
1894 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ | 1985 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ |
1895 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ | 1986 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ |
1896 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 1987 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
1897 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 1988 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1977 [v_buf]"+r"(v_buf), // %[v_buf] | 2068 [v_buf]"+r"(v_buf), // %[v_buf] |
1978 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2069 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1979 [width]"+rm"(width) // %[width] | 2070 [width]"+rm"(width) // %[width] |
1980 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
1981 : "memory", "cc", NACL_R14 | 2072 : "memory", "cc", NACL_R14 |
1982 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2073 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
1983 ); | 2074 ); |
1984 } | 2075 } |
1985 #endif // HAS_I422TOARGBROW_AVX2 | 2076 #endif // HAS_I422TOARGBROW_AVX2 |
1986 | 2077 |
| 2078 #if defined(HAS_J422TOARGBROW_AVX2) |
| 2079 // 16 pixels |
| 2080 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2081 void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, |
| 2082 const uint8* u_buf, |
| 2083 const uint8* v_buf, |
| 2084 uint8* dst_argb, |
| 2085 int width) { |
| 2086 asm volatile ( |
| 2087 "sub %[u_buf],%[v_buf] \n" |
| 2088 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2089 LABELALIGN |
| 2090 "1: \n" |
| 2091 READYUV422_AVX2 |
| 2092 YUVTORGB_AVX2(kYuvConstants) |
| 2093 |
| 2094 // Step 3: Weave into ARGB |
| 2095 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG |
| 2096 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 2097 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA |
| 2098 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 2099 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels |
| 2100 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels |
| 2101 |
| 2102 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" |
| 2103 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" |
| 2104 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
| 2105 "sub $0x10,%[width] \n" |
| 2106 "jg 1b \n" |
| 2107 "vzeroupper \n" |
| 2108 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2109 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2110 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2111 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2112 [width]"+rm"(width) // %[width] |
| 2113 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] |
| 2114 : "memory", "cc", NACL_R14 |
| 2115 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 2116 ); |
| 2117 } |
| 2118 #endif // HAS_J422TOARGBROW_AVX2 |
| 2119 |
1987 #if defined(HAS_I422TOABGRROW_AVX2) | 2120 #if defined(HAS_I422TOABGRROW_AVX2) |
1988 // 16 pixels | 2121 // 16 pixels |
1989 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
1990 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, | 2123 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
1991 const uint8* u_buf, | 2124 const uint8* u_buf, |
1992 const uint8* v_buf, | 2125 const uint8* v_buf, |
1993 uint8* dst_argb, | 2126 uint8* dst_argb, |
1994 int width) { | 2127 int width) { |
1995 asm volatile ( | 2128 asm volatile ( |
1996 "sub %[u_buf],%[v_buf] \n" | 2129 "sub %[u_buf],%[v_buf] \n" |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2059 [v_buf]"+r"(v_buf), // %[v_buf] | 2192 [v_buf]"+r"(v_buf), // %[v_buf] |
2060 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2193 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2061 [width]"+rm"(width) // %[width] | 2194 [width]"+rm"(width) // %[width] |
2062 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] | 2195 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
2063 : "memory", "cc", NACL_R14 | 2196 : "memory", "cc", NACL_R14 |
2064 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2197 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
2065 ); | 2198 ); |
2066 } | 2199 } |
2067 #endif // HAS_I422TORGBAROW_AVX2 | 2200 #endif // HAS_I422TORGBAROW_AVX2 |
2068 | 2201 |
2069 #ifdef HAS_YTOARGBROW_SSE2 | 2202 #ifdef HAS_I400TOARGBROW_SSE2 |
2070 void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2203 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
2071 asm volatile ( | 2204 asm volatile ( |
2072 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2205 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2073 "movd %%eax,%%xmm2 \n" | 2206 "movd %%eax,%%xmm2 \n" |
2074 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2207 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
2075 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2208 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
2076 "movd %%eax,%%xmm3 \n" | 2209 "movd %%eax,%%xmm3 \n" |
2077 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 2210 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
2078 "pcmpeqb %%xmm4,%%xmm4 \n" | 2211 "pcmpeqb %%xmm4,%%xmm4 \n" |
2079 "pslld $0x18,%%xmm4 \n" | 2212 "pslld $0x18,%%xmm4 \n" |
2080 LABELALIGN | 2213 LABELALIGN |
(...skipping 21 matching lines...) Expand all Loading... |
2102 "sub $0x8,%2 \n" | 2235 "sub $0x8,%2 \n" |
2103 "jg 1b \n" | 2236 "jg 1b \n" |
2104 : "+r"(y_buf), // %0 | 2237 : "+r"(y_buf), // %0 |
2105 "+r"(dst_argb), // %1 | 2238 "+r"(dst_argb), // %1 |
2106 "+rm"(width) // %2 | 2239 "+rm"(width) // %2 |
2107 : | 2240 : |
2108 : "memory", "cc", "eax" | 2241 : "memory", "cc", "eax" |
2109 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 2242 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
2110 ); | 2243 ); |
2111 } | 2244 } |
2112 #endif // HAS_YTOARGBROW_SSE2 | 2245 #endif // HAS_I400TOARGBROW_SSE2 |
2113 | 2246 |
2114 #ifdef HAS_YTOARGBROW_AVX2 | 2247 #ifdef HAS_I400TOARGBROW_AVX2 |
2115 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). | 2248 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). |
2116 // note: vpunpcklbw mutates and vpackuswb unmutates. | 2249 // note: vpunpcklbw mutates and vpackuswb unmutates. |
2117 void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { | 2250 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { |
2118 asm volatile ( | 2251 asm volatile ( |
2119 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2252 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2120 "vmovd %%eax,%%xmm2 \n" | 2253 "vmovd %%eax,%%xmm2 \n" |
2121 "vbroadcastss %%xmm2,%%ymm2 \n" | 2254 "vbroadcastss %%xmm2,%%ymm2 \n" |
2122 "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2255 "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
2123 "vmovd %%eax,%%xmm3 \n" | 2256 "vmovd %%eax,%%xmm3 \n" |
2124 "vbroadcastss %%xmm3,%%ymm3 \n" | 2257 "vbroadcastss %%xmm3,%%ymm3 \n" |
2125 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | 2258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
2126 "vpslld $0x18,%%ymm4,%%ymm4 \n" | 2259 "vpslld $0x18,%%ymm4,%%ymm4 \n" |
2127 | 2260 |
(...skipping 21 matching lines...) Expand all Loading... |
2149 "jg 1b \n" | 2282 "jg 1b \n" |
2150 "vzeroupper \n" | 2283 "vzeroupper \n" |
2151 : "+r"(y_buf), // %0 | 2284 : "+r"(y_buf), // %0 |
2152 "+r"(dst_argb), // %1 | 2285 "+r"(dst_argb), // %1 |
2153 "+rm"(width) // %2 | 2286 "+rm"(width) // %2 |
2154 : | 2287 : |
2155 : "memory", "cc", "eax" | 2288 : "memory", "cc", "eax" |
2156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 2289 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
2157 ); | 2290 ); |
2158 } | 2291 } |
2159 #endif // HAS_YTOARGBROW_AVX2 | 2292 #endif // HAS_I400TOARGBROW_AVX2 |
2160 | 2293 |
2161 #ifdef HAS_MIRRORROW_SSSE3 | 2294 #ifdef HAS_MIRRORROW_SSSE3 |
2162 // Shuffle table for reversing the bytes. | 2295 // Shuffle table for reversing the bytes. |
2163 static uvec8 kShuffleMirror = { | 2296 static uvec8 kShuffleMirror = { |
2164 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 2297 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
2165 }; | 2298 }; |
2166 | 2299 |
2167 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 2300 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
2168 intptr_t temp_width = (intptr_t)(width); | 2301 intptr_t temp_width = (intptr_t)(width); |
2169 asm volatile ( | 2302 asm volatile ( |
(...skipping 919 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3089 uint8* dst_argb, int width) { | 3222 uint8* dst_argb, int width) { |
3090 asm volatile ( | 3223 asm volatile ( |
3091 "pcmpeqb %%xmm7,%%xmm7 \n" | 3224 "pcmpeqb %%xmm7,%%xmm7 \n" |
3092 "psrlw $0xf,%%xmm7 \n" | 3225 "psrlw $0xf,%%xmm7 \n" |
3093 "pcmpeqb %%xmm6,%%xmm6 \n" | 3226 "pcmpeqb %%xmm6,%%xmm6 \n" |
3094 "psrlw $0x8,%%xmm6 \n" | 3227 "psrlw $0x8,%%xmm6 \n" |
3095 "pcmpeqb %%xmm5,%%xmm5 \n" | 3228 "pcmpeqb %%xmm5,%%xmm5 \n" |
3096 "psllw $0x8,%%xmm5 \n" | 3229 "psllw $0x8,%%xmm5 \n" |
3097 "pcmpeqb %%xmm4,%%xmm4 \n" | 3230 "pcmpeqb %%xmm4,%%xmm4 \n" |
3098 "pslld $0x18,%%xmm4 \n" | 3231 "pslld $0x18,%%xmm4 \n" |
3099 "sub $0x1,%3 \n" | 3232 "sub $0x4,%3 \n" |
3100 "je 91f \n" | |
3101 "jl 99f \n" | |
3102 | |
3103 // 1 pixel loop until destination pointer is aligned. | |
3104 "10: \n" | |
3105 "test $0xf,%2 \n" | |
3106 "je 19f \n" | |
3107 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3108 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3109 "movdqa %%xmm3,%%xmm0 \n" | |
3110 "pxor %%xmm4,%%xmm3 \n" | |
3111 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3112 "psrlw $0x8,%%xmm3 \n" | |
3113 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3114 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3115 "pand %%xmm6,%%xmm2 \n" | |
3116 "paddw %%xmm7,%%xmm3 \n" | |
3117 "pmullw %%xmm3,%%xmm2 \n" | |
3118 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3119 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3120 "psrlw $0x8,%%xmm1 \n" | |
3121 "por %%xmm4,%%xmm0 \n" | |
3122 "pmullw %%xmm3,%%xmm1 \n" | |
3123 "psrlw $0x8,%%xmm2 \n" | |
3124 "paddusb %%xmm2,%%xmm0 \n" | |
3125 "pand %%xmm5,%%xmm1 \n" | |
3126 "paddusb %%xmm1,%%xmm0 \n" | |
3127 "movd %%xmm0," MEMACCESS(2) " \n" | |
3128 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3129 "sub $0x1,%3 \n" | |
3130 "jge 10b \n" | |
3131 | |
3132 "19: \n" | |
3133 "add $1-4,%3 \n" | |
3134 "jl 49f \n" | 3233 "jl 49f \n" |
3135 | 3234 |
3136 // 4 pixel loop. | 3235 // 4 pixel loop. |
3137 LABELALIGN | 3236 LABELALIGN |
3138 "41: \n" | 3237 "41: \n" |
3139 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | 3238 "movdqu " MEMACCESS(0) ",%%xmm3 \n" |
3140 "lea " MEMLEA(0x10,0) ",%0 \n" | 3239 "lea " MEMLEA(0x10,0) ",%0 \n" |
3141 "movdqa %%xmm3,%%xmm0 \n" | 3240 "movdqa %%xmm3,%%xmm0 \n" |
3142 "pxor %%xmm4,%%xmm3 \n" | 3241 "pxor %%xmm4,%%xmm3 \n" |
3143 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | 3242 "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3224 uint8* dst_argb, int width) { | 3323 uint8* dst_argb, int width) { |
3225 asm volatile ( | 3324 asm volatile ( |
3226 "pcmpeqb %%xmm7,%%xmm7 \n" | 3325 "pcmpeqb %%xmm7,%%xmm7 \n" |
3227 "psrlw $0xf,%%xmm7 \n" | 3326 "psrlw $0xf,%%xmm7 \n" |
3228 "pcmpeqb %%xmm6,%%xmm6 \n" | 3327 "pcmpeqb %%xmm6,%%xmm6 \n" |
3229 "psrlw $0x8,%%xmm6 \n" | 3328 "psrlw $0x8,%%xmm6 \n" |
3230 "pcmpeqb %%xmm5,%%xmm5 \n" | 3329 "pcmpeqb %%xmm5,%%xmm5 \n" |
3231 "psllw $0x8,%%xmm5 \n" | 3330 "psllw $0x8,%%xmm5 \n" |
3232 "pcmpeqb %%xmm4,%%xmm4 \n" | 3331 "pcmpeqb %%xmm4,%%xmm4 \n" |
3233 "pslld $0x18,%%xmm4 \n" | 3332 "pslld $0x18,%%xmm4 \n" |
3234 "sub $0x1,%3 \n" | 3333 "sub $0x4,%3 \n" |
3235 "je 91f \n" | |
3236 "jl 99f \n" | |
3237 | |
3238 // 1 pixel loop until destination pointer is aligned. | |
3239 "10: \n" | |
3240 "test $0xf,%2 \n" | |
3241 "je 19f \n" | |
3242 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3243 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3244 "movdqa %%xmm3,%%xmm0 \n" | |
3245 "pxor %%xmm4,%%xmm3 \n" | |
3246 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3247 "pshufb %4,%%xmm3 \n" | |
3248 "pand %%xmm6,%%xmm2 \n" | |
3249 "paddw %%xmm7,%%xmm3 \n" | |
3250 "pmullw %%xmm3,%%xmm2 \n" | |
3251 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3252 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3253 "psrlw $0x8,%%xmm1 \n" | |
3254 "por %%xmm4,%%xmm0 \n" | |
3255 "pmullw %%xmm3,%%xmm1 \n" | |
3256 "psrlw $0x8,%%xmm2 \n" | |
3257 "paddusb %%xmm2,%%xmm0 \n" | |
3258 "pand %%xmm5,%%xmm1 \n" | |
3259 "paddusb %%xmm1,%%xmm0 \n" | |
3260 "movd %%xmm0," MEMACCESS(2) " \n" | |
3261 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3262 "sub $0x1,%3 \n" | |
3263 "jge 10b \n" | |
3264 | |
3265 "19: \n" | |
3266 "add $1-4,%3 \n" | |
3267 "jl 49f \n" | 3334 "jl 49f \n" |
3268 | 3335 |
3269 // 4 pixel loop. | 3336 // 4 pixel loop. |
3270 LABELALIGN | 3337 LABELALIGN |
3271 "40: \n" | 3338 "40: \n" |
3272 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | 3339 "movdqu " MEMACCESS(0) ",%%xmm3 \n" |
3273 "lea " MEMLEA(0x10,0) ",%0 \n" | 3340 "lea " MEMLEA(0x10,0) ",%0 \n" |
3274 "movdqa %%xmm3,%%xmm0 \n" | 3341 "movdqa %%xmm3,%%xmm0 \n" |
3275 "pxor %%xmm4,%%xmm3 \n" | 3342 "pxor %%xmm4,%%xmm3 \n" |
3276 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | 3343 "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
(...skipping 1613 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4890 "+r"(src_ptr), // %1 | 4957 "+r"(src_ptr), // %1 |
4891 "+r"(dst_width), // %2 | 4958 "+r"(dst_width), // %2 |
4892 "+r"(source_y_fraction) // %3 | 4959 "+r"(source_y_fraction) // %3 |
4893 : "r"((intptr_t)(src_stride)) // %4 | 4960 : "r"((intptr_t)(src_stride)) // %4 |
4894 : "memory", "cc", NACL_R14 | 4961 : "memory", "cc", NACL_R14 |
4895 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 4962 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
4896 ); | 4963 ); |
4897 } | 4964 } |
4898 #endif // HAS_INTERPOLATEROW_SSE2 | 4965 #endif // HAS_INTERPOLATEROW_SSE2 |
4899 | 4966 |
4900 #ifdef HAS_ARGBTOBAYERGGROW_SSE2 | |
4901 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, | |
4902 uint32 selector, int pix) { | |
4903 asm volatile ( | |
4904 "pcmpeqb %%xmm5,%%xmm5 \n" | |
4905 "psrld $0x18,%%xmm5 \n" | |
4906 LABELALIGN | |
4907 "1: \n" | |
4908 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
4909 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | |
4910 "lea " MEMLEA(0x20,0) ",%0 \n" | |
4911 "psrld $0x8,%%xmm0 \n" | |
4912 "psrld $0x8,%%xmm1 \n" | |
4913 "pand %%xmm5,%%xmm0 \n" | |
4914 "pand %%xmm5,%%xmm1 \n" | |
4915 "packssdw %%xmm1,%%xmm0 \n" | |
4916 "packuswb %%xmm1,%%xmm0 \n" | |
4917 "movq %%xmm0," MEMACCESS(1) " \n" | |
4918 "lea " MEMLEA(0x8,1) ",%1 \n" | |
4919 "sub $0x8,%2 \n" | |
4920 "jg 1b \n" | |
4921 : "+r"(src_argb), // %0 | |
4922 "+r"(dst_bayer), // %1 | |
4923 "+r"(pix) // %2 | |
4924 : | |
4925 : "memory", "cc" | |
4926 , "xmm0", "xmm1", "xmm5" | |
4927 ); | |
4928 } | |
4929 #endif // HAS_ARGBTOBAYERGGROW_SSE2 | |
4930 | |
4931 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 | 4967 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
4932 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 4968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
4933 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 4969 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
4934 const uint8* shuffler, int pix) { | 4970 const uint8* shuffler, int pix) { |
4935 asm volatile ( | 4971 asm volatile ( |
4936 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 4972 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
4937 LABELALIGN | 4973 LABELALIGN |
4938 "1: \n" | 4974 "1: \n" |
4939 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4975 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
4940 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 4976 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
(...skipping 489 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5430 ); | 5466 ); |
5431 } | 5467 } |
5432 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5468 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5433 | 5469 |
5434 #endif // defined(__x86_64__) || defined(__i386__) | 5470 #endif // defined(__x86_64__) || defined(__i386__) |
5435 | 5471 |
5436 #ifdef __cplusplus | 5472 #ifdef __cplusplus |
5437 } // extern "C" | 5473 } // extern "C" |
5438 } // namespace libyuv | 5474 } // namespace libyuv |
5439 #endif | 5475 #endif |
OLD | NEW |