Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(398)

Side by Side Diff: source/row_win.cc

Issue 1702373004: Port ARGBToJ420 to AVX2 (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: fix lint warnings for spacing in row_gcc.cc Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 1487 matching lines...) Expand 10 before | Expand all | Expand 10 after
1498 // from here down is very similar to Y code except 1498 // from here down is very similar to Y code except
1499 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1499 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1500 movdqa xmm1, xmm0 1500 movdqa xmm1, xmm0
1501 movdqa xmm3, xmm2 1501 movdqa xmm3, xmm2
1502 pmaddubsw xmm0, xmm7 // U 1502 pmaddubsw xmm0, xmm7 // U
1503 pmaddubsw xmm2, xmm7 1503 pmaddubsw xmm2, xmm7
1504 pmaddubsw xmm1, xmm6 // V 1504 pmaddubsw xmm1, xmm6 // V
1505 pmaddubsw xmm3, xmm6 1505 pmaddubsw xmm3, xmm6
1506 phaddw xmm0, xmm2 1506 phaddw xmm0, xmm2
1507 phaddw xmm1, xmm3 1507 phaddw xmm1, xmm3
1508 paddw xmm0, xmm5 // +.5 rounding -> unsigned 1508 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1509 paddw xmm1, xmm5 1509 paddw xmm1, xmm5
1510 psraw xmm0, 8 1510 psraw xmm0, 8
1511 psraw xmm1, 8 1511 psraw xmm1, 8
1512 packsswb xmm0, xmm1 1512 packsswb xmm0, xmm1
1513 1513
1514 // step 3 - store 8 U and 8 V values 1514 // step 3 - store 8 U and 8 V values
1515 movlps qword ptr [edx], xmm0 // U 1515 movlps qword ptr [edx], xmm0 // U
1516 movhps qword ptr [edx + edi], xmm0 // V 1516 movhps qword ptr [edx + edi], xmm0 // V
1517 lea edx, [edx + 8] 1517 lea edx, [edx + 8]
1518 sub ecx, 16 1518 sub ecx, 16
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
1583 jg convertloop 1583 jg convertloop
1584 1584
1585 pop edi 1585 pop edi
1586 pop esi 1586 pop esi
1587 vzeroupper 1587 vzeroupper
1588 ret 1588 ret
1589 } 1589 }
1590 } 1590 }
1591 #endif // HAS_ARGBTOUVROW_AVX2 1591 #endif // HAS_ARGBTOUVROW_AVX2
1592 1592
1593 #ifdef HAS_ARGBTOUVJROW_AVX2
// ARGBToUVJRow_AVX2: reads two rows of ARGB (src_argb0 and
// src_argb0 + src_stride_argb), averages each 2x2 pixel group, and writes one
// U byte and one V byte per group to dst_u / dst_v.
// Naked 32-bit function: arguments are read straight off the stack, esi/edi
// are saved and restored by hand, and there is no compiler prologue/epilogue.
// Each loop pass consumes 32 pixels (128 bytes) from each row and stores
// 16 U + 16 V bytes. The width test is at the bottom of the loop, so the body
// always runs at least once and always processes a full 32 pixels.
// NOTE(review): presumably callers guarantee width > 0 and handle any
// remainder that is not a multiple of 32 - confirm against the caller.
// NOTE(review): this is the "J" variant, yet it loads kARGBToU / kARGBToV /
// kAddUV128, none of which carry a "J" in their names. Confirm whether
// J-specific coefficient/bias tables were intended here.
1594 __declspec(naked)
1595 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1596 uint8* dst_u, uint8* dst_v, int width) {
1597 __asm {
1598 push esi
1599 push edi
1600 mov eax, [esp + 8 + 4] // src_argb (the +8 skips the two pushes above)
1601 mov esi, [esp + 8 + 8] // src_stride_argb
1602 mov edx, [esp + 8 + 12] // dst_u
1603 mov edi, [esp + 8 + 16] // dst_v
1604 mov ecx, [esp + 8 + 20] // width
1605 vbroadcastf128 ymm5, xmmword ptr kAddUV128 // ymm5 = word bias added before the >> 8
1606 vbroadcastf128 ymm6, xmmword ptr kARGBToV // ymm6 = V coefficients, copied to both lanes
1607 vbroadcastf128 ymm7, xmmword ptr kARGBToU // ymm7 = U coefficients, copied to both lanes
1608 sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
1609
1610 convertloop:
1611 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1612 vmovdqu ymm0, [eax] // 128 bytes (32 pixels) of the first row
1613 vmovdqu ymm1, [eax + 32]
1614 vmovdqu ymm2, [eax + 64]
1615 vmovdqu ymm3, [eax + 96]
1616 vpavgb ymm0, ymm0, [eax + esi] // vertical average with the row at +stride
1617 vpavgb ymm1, ymm1, [eax + esi + 32]
1618 vpavgb ymm2, ymm2, [eax + esi + 64]
1619 vpavgb ymm3, ymm3, [eax + esi + 96]
1620 lea eax, [eax + 128] // advance source by 32 pixels
1621 vshufps ymm4, ymm0, ymm1, 0x88 // even-indexed pixels (dwords 0 and 2 of each source lane)
1622 vshufps ymm0, ymm0, ymm1, 0xdd // odd-indexed pixels (dwords 1 and 3 of each source lane)
1623 vpavgb ymm0, ymm0, ymm4 // horizontal average; vshufps is per-lane, so pixel order is permuted
1624 vshufps ymm4, ymm2, ymm3, 0x88
1625 vshufps ymm2, ymm2, ymm3, 0xdd
1626 vpavgb ymm2, ymm2, ymm4 // same for the second 16 pixels; order permuted as above
1627
1628 // step 2 - convert to U and V
1629 // from here down is very similar to Y code except
1630 // instead of 32 different pixels, it's 16 pixels of U and 16 of V
1631 vpmaddubsw ymm1, ymm0, ymm7 // U
1632 vpmaddubsw ymm3, ymm2, ymm7
1633 vpmaddubsw ymm0, ymm0, ymm6 // V
1634 vpmaddubsw ymm2, ymm2, ymm6
1635 vphaddw ymm1, ymm1, ymm3 // U: sum word pairs per pixel (per-lane, so order is permuted)
1636 vphaddw ymm0, ymm0, ymm2 // V: same
1637 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
1638 vpaddw ymm0, ymm0, ymm5
1639 vpsraw ymm1, ymm1, 8
1640 vpsraw ymm0, ymm0, 8
1641 vpacksswb ymm0, ymm1, ymm0 // pack U (ymm1) and V (ymm0) words to bytes; per-lane, so U/V interleave by lane
1642 vpermq ymm0, ymm0, 0xd8 // For vpacksswb: reorder qwords 0,2,1,3
1643 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw: undo their per-lane reordering
1644
1645 // step 3 - store 16 U and 16 V values
1646 vextractf128 [edx], ymm0, 0 // U
1647 vextractf128 [edx + edi], ymm0, 1 // V
1648 lea edx, [edx + 16] // advance dst_u (and, via edi, dst_v) by 16 bytes
1649 sub ecx, 32 // 32 source pixels consumed per pass
1650 jg convertloop // continue while pixels remain (signed > 0)
1651
1652 pop edi
1653 pop esi
1654 vzeroupper // clear upper ymm halves before returning to the caller
1655 ret
1656 }
1657 }
1658 #endif // HAS_ARGBTOUVJROW_AVX2
1659
1593 __declspec(naked) 1660 __declspec(naked)
1594 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, 1661 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1595 uint8* dst_u, uint8* dst_v, int width) { 1662 uint8* dst_u, uint8* dst_v, int width) {
1596 __asm { 1663 __asm {
1597 push edi 1664 push edi
1598 mov eax, [esp + 4 + 4] // src_argb 1665 mov eax, [esp + 4 + 4] // src_argb
1599 mov edx, [esp + 4 + 8] // dst_u 1666 mov edx, [esp + 4 + 8] // dst_u
1600 mov edi, [esp + 4 + 12] // dst_v 1667 mov edi, [esp + 4 + 12] // dst_v
1601 mov ecx, [esp + 4 + 16] // width 1668 mov ecx, [esp + 4 + 16] // width
1602 movdqa xmm5, xmmword ptr kAddUV128 1669 movdqa xmm5, xmmword ptr kAddUV128
(...skipping 4562 matching lines...) Expand 10 before | Expand all | Expand 10 after
6165 } 6232 }
6166 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6233 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6167 6234
6168 #endif // defined(_M_X64) 6235 #endif // defined(_M_X64)
6169 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6236 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6170 6237
6171 #ifdef __cplusplus 6238 #ifdef __cplusplus
6172 } // extern "C" 6239 } // extern "C"
6173 } // namespace libyuv 6240 } // namespace libyuv
6174 #endif 6241 #endif
OLDNEW
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698