OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 1487 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1498 // from here down is very similar to Y code except | 1498 // from here down is very similar to Y code except |
1499 // instead of 16 different pixels, its 8 pixels of U and 8 of V | 1499 // instead of 16 different pixels, its 8 pixels of U and 8 of V |
1500 movdqa xmm1, xmm0 | 1500 movdqa xmm1, xmm0 |
1501 movdqa xmm3, xmm2 | 1501 movdqa xmm3, xmm2 |
1502 pmaddubsw xmm0, xmm7 // U | 1502 pmaddubsw xmm0, xmm7 // U |
1503 pmaddubsw xmm2, xmm7 | 1503 pmaddubsw xmm2, xmm7 |
1504 pmaddubsw xmm1, xmm6 // V | 1504 pmaddubsw xmm1, xmm6 // V |
1505 pmaddubsw xmm3, xmm6 | 1505 pmaddubsw xmm3, xmm6 |
1506 phaddw xmm0, xmm2 | 1506 phaddw xmm0, xmm2 |
1507 phaddw xmm1, xmm3 | 1507 phaddw xmm1, xmm3 |
1508 paddw xmm0, xmm5 // +.5 rounding -> unsigned | 1508 paddw xmm0, xmm5 // +.5 rounding -> unsigned |
1509 paddw xmm1, xmm5 | 1509 paddw xmm1, xmm5 |
1510 psraw xmm0, 8 | 1510 psraw xmm0, 8 |
1511 psraw xmm1, 8 | 1511 psraw xmm1, 8 |
1512 packsswb xmm0, xmm1 | 1512 packsswb xmm0, xmm1 |
1513 | 1513 |
1514 // step 3 - store 8 U and 8 V values | 1514 // step 3 - store 8 U and 8 V values |
1515 movlps qword ptr [edx], xmm0 // U | 1515 movlps qword ptr [edx], xmm0 // U |
1516 movhps qword ptr [edx + edi], xmm0 // V | 1516 movhps qword ptr [edx + edi], xmm0 // V |
1517 lea edx, [edx + 8] | 1517 lea edx, [edx + 8] |
1518 sub ecx, 16 | 1518 sub ecx, 16 |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1583 jg convertloop | 1583 jg convertloop |
1584 | 1584 |
1585 pop edi | 1585 pop edi |
1586 pop esi | 1586 pop esi |
1587 vzeroupper | 1587 vzeroupper |
1588 ret | 1588 ret |
1589 } | 1589 } |
1590 } | 1590 } |
1591 #endif // HAS_ARGBTOUVROW_AVX2 | 1591 #endif // HAS_ARGBTOUVROW_AVX2 |
1592 | 1592 |
| 1593 #ifdef HAS_ARGBTOUVJROW_AVX2 |
| 1594 __declspec(naked) // J variant: 2x2-subsamples two ARGB rows into one row of U and one row of V. |
| 1595 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, // first source row; byte offset from it to the second row |
| 1596 uint8* dst_u, uint8* dst_v, int width) { // each loop pass consumes 32 pixels of width and stores 16 U + 16 V bytes |
| 1597 __asm { |
| 1598 push esi // esi and edi are used below and restored before ret |
| 1599 push edi |
| 1600 mov eax, [esp + 8 + 4] // src_argb (the extra 8 skips the two pushes above) |
| 1601 mov esi, [esp + 8 + 8] // src_stride_argb |
| 1602 mov edx, [esp + 8 + 12] // dst_u |
| 1603 mov edi, [esp + 8 + 16] // dst_v |
| 1604 mov ecx, [esp + 8 + 20] // width |
| 1605 vbroadcastf128 ymm5, xmmword ptr kAddUVJ128 // 16-bit bias added by vpaddw before the >> 8 |
| 1606 vbroadcastf128 ymm6, xmmword ptr kARGBToVJ // J coefficient tables: the non-J kARGBToV/kARGBToU here |
| 1607 vbroadcastf128 ymm7, xmmword ptr kARGBToUJ // would make this routine compute non-J chroma |
| 1608 sub edi, edx // stride from u to v, so [edx + edi] addresses dst_v |
| 1609 |
| 1610 convertloop: |
| 1611 /* step 1 - subsample 32x2 argb pixels to 16x1 */ |
| 1612 vmovdqu ymm0, [eax] // 128 bytes of the first row |
| 1613 vmovdqu ymm1, [eax + 32] |
| 1614 vmovdqu ymm2, [eax + 64] |
| 1615 vmovdqu ymm3, [eax + 96] |
| 1616 vpavgb ymm0, ymm0, [eax + esi] // vertical average with the row src_stride_argb bytes away |
| 1617 vpavgb ymm1, ymm1, [eax + esi + 32] |
| 1618 vpavgb ymm2, ymm2, [eax + esi + 64] |
| 1619 vpavgb ymm3, ymm3, [eax + esi + 96] |
| 1620 lea eax, [eax + 128] // advance source by the 128 bytes just read |
| 1621 vshufps ymm4, ymm0, ymm1, 0x88 // even dwords of each 128-bit lane |
| 1622 vshufps ymm0, ymm0, ymm1, 0xdd // odd dwords of each 128-bit lane |
| 1623 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps; horizontal average of neighbouring dwords |
| 1624 vshufps ymm4, ymm2, ymm3, 0x88 |
| 1625 vshufps ymm2, ymm2, ymm3, 0xdd |
| 1626 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps |
| 1627 |
| 1628 // step 2 - convert to U and V |
| 1629 // from here down is very similar to Y code except |
| 1630 // instead of 32 different pixels, it's 16 pixels of U and 16 of V |
| 1631 vpmaddubsw ymm1, ymm0, ymm7 // U |
| 1632 vpmaddubsw ymm3, ymm2, ymm7 |
| 1633 vpmaddubsw ymm0, ymm0, ymm6 // V |
| 1634 vpmaddubsw ymm2, ymm2, ymm6 |
| 1635 vphaddw ymm1, ymm1, ymm3 // mutates; sums the two partial products of each dword |
| 1636 vphaddw ymm0, ymm0, ymm2 |
| 1637 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned |
| 1638 vpaddw ymm0, ymm0, ymm5 |
| 1639 vpsraw ymm1, ymm1, 8 |
| 1640 vpsraw ymm0, ymm0, 8 |
| 1641 vpacksswb ymm0, ymm1, ymm0 // mutates; per lane: U bytes in the low half, V bytes in the high half |
| 1642 vpermq ymm0, ymm0, 0xd8 // For vpacksswb; gathers U into the low 128 bits and V into the high 128 bits |
| 1643 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw |
| 1644 |
| 1645 // step 3 - store 16 U and 16 V values |
| 1646 vextractf128 [edx], ymm0, 0 // U |
| 1647 vextractf128 [edx + edi], ymm0, 1 // V |
| 1648 lea edx, [edx + 16] |
| 1649 sub ecx, 32 // NOTE(review): always handles a full 32 pixels; presumably callers pass width as a multiple of 32 - confirm |
| 1650 jg convertloop |
| 1651 |
| 1652 pop edi |
| 1653 pop esi |
| 1654 vzeroupper // clear upper ymm halves before returning to the caller |
| 1655 ret |
| 1656 } |
| 1657 } |
| 1658 #endif // HAS_ARGBTOUVJROW_AVX2 |
| 1659 |
1593 __declspec(naked) | 1660 __declspec(naked) |
1594 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, | 1661 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
1595 uint8* dst_u, uint8* dst_v, int width) { | 1662 uint8* dst_u, uint8* dst_v, int width) { |
1596 __asm { | 1663 __asm { |
1597 push edi | 1664 push edi |
1598 mov eax, [esp + 4 + 4] // src_argb | 1665 mov eax, [esp + 4 + 4] // src_argb |
1599 mov edx, [esp + 4 + 8] // dst_u | 1666 mov edx, [esp + 4 + 8] // dst_u |
1600 mov edi, [esp + 4 + 12] // dst_v | 1667 mov edi, [esp + 4 + 12] // dst_v |
1601 mov ecx, [esp + 4 + 16] // width | 1668 mov ecx, [esp + 4 + 16] // width |
1602 movdqa xmm5, xmmword ptr kAddUV128 | 1669 movdqa xmm5, xmmword ptr kAddUV128 |
(...skipping 4562 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6165 } | 6232 } |
6166 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6233 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6167 | 6234 |
6168 #endif // defined(_M_X64) | 6235 #endif // defined(_M_X64) |
6169 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6236 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6170 | 6237 |
6171 #ifdef __cplusplus | 6238 #ifdef __cplusplus |
6172 } // extern "C" | 6239 } // extern "C" |
6173 } // namespace libyuv | 6240 } // namespace libyuv |
6174 #endif | 6241 #endif |
OLD | NEW |