OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 66 matching lines...)
77 #define STOREARGB \ | 77 #define STOREARGB \ |
78 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ | 78 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
79 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ | 79 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ |
80 xmm1 = _mm_loadu_si128(&xmm0); \ | 80 xmm1 = _mm_loadu_si128(&xmm0); \ |
81 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ | 81 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ |
82 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ | 82 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ |
83 _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ | 83 _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ |
84 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ | 84 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ |
85 dst_argb += 32; | 85 dst_argb += 32; |
86 | 86 |
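For readers skimming the review, the STOREARGB intrinsics macro kept above is the usual two-level byte weave. A minimal standalone sketch of the same pattern follows, assuming SSE2 only; the function name weave_argb and its parameters are illustrative, not libyuv API.

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Weave the low 8 B, G and R bytes of three registers, plus constant 0xff
// alpha, into 8 ARGB pixels (B,G,R,A byte order in memory, 32 bytes total).
static void weave_argb(__m128i b, __m128i g, __m128i r, uint8_t* dst) {
  const __m128i a = _mm_set1_epi8(-1);           // alpha = 0xff
  __m128i bg = _mm_unpacklo_epi8(b, g);          // B0 G0 B1 G1 ...
  __m128i ra = _mm_unpacklo_epi8(r, a);          // R0 A0 R1 A1 ...
  __m128i lo = _mm_unpacklo_epi16(bg, ra);       // pixels 0..3
  __m128i hi = _mm_unpackhi_epi16(bg, ra);       // pixels 4..7
  _mm_storeu_si128((__m128i*)dst, lo);
  _mm_storeu_si128((__m128i*)(dst + 16), hi);
}

The macro itself copies xmm0 into xmm1 before the word-level unpack only because it reuses the same registers in place.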
87 // Store 8 ABGR values. | |
88 #define STOREABGR \ | |
89 xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ | |
90 xmm0 = _mm_unpacklo_epi8(xmm0, xmm5); \ | |
91 xmm1 = _mm_loadu_si128(&xmm2); \ | |
92 xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \ | |
93 xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \ | |
94 _mm_storeu_si128((__m128i *)dst_abgr, xmm2); \ | |
95 _mm_storeu_si128((__m128i *)(dst_abgr + 16), xmm1); \ | |
96 dst_abgr += 32; | |
97 | |
98 | 87 |
99 #if defined(HAS_I422TOARGBROW_SSSE3) | 88 #if defined(HAS_I422TOARGBROW_SSSE3) |
100 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 89 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
101 const uint8* u_buf, | 90 const uint8* u_buf, |
102 const uint8* v_buf, | 91 const uint8* v_buf, |
103 uint8* dst_argb, | 92 uint8* dst_argb, |
104 const struct YuvConstants* yuvconstants, | 93 const struct YuvConstants* yuvconstants, |
105 int width) { | 94 int width) { |
106 __m128i xmm0, xmm1, xmm2, xmm4; | 95 __m128i xmm0, xmm1, xmm2, xmm4; |
107 const __m128i xmm5 = _mm_set1_epi8(-1); | 96 const __m128i xmm5 = _mm_set1_epi8(-1); |
(...skipping 1910 matching lines...)
2018 __asm vpermq ymm0, ymm0, 0xd8 \ | 2007 __asm vpermq ymm0, ymm0, 0xd8 \ |
2019 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ | 2008 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ |
2020 __asm vpermq ymm2, ymm2, 0xd8 \ | 2009 __asm vpermq ymm2, ymm2, 0xd8 \ |
2021 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ | 2010 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ |
2022 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ | 2011 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ |
2023 __asm vmovdqu 0[edx], ymm1 \ | 2012 __asm vmovdqu 0[edx], ymm1 \ |
2024 __asm vmovdqu 32[edx], ymm0 \ | 2013 __asm vmovdqu 32[edx], ymm0 \ |
2025 __asm lea edx, [edx + 64] \ | 2014 __asm lea edx, [edx + 64] \ |
2026 } | 2015 } |
2027 | 2016 |
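The AVX2 store macros in this range all follow the same unpack-then-vpermq pattern: vpunpcklbw interleaves within each 128-bit lane independently, so vpermq with immediate 0xd8 (qword order 0,2,1,3) is needed to restore pixel order before the word-level unpack. A hedged intrinsics sketch, assuming each input register carries values 0..7 in the low 8 bytes of lane 0 and values 8..15 in the low 8 bytes of lane 1 (the layout vpackuswb leaves behind in these row functions); the name weave_argb_avx2 is illustrative.

#include <immintrin.h>  // AVX2
#include <stdint.h>

static void weave_argb_avx2(__m256i b, __m256i g, __m256i r, uint8_t* dst) {
  const __m256i a = _mm256_set1_epi8(-1);          // alpha = 0xff
  __m256i bg = _mm256_unpacklo_epi8(b, g);         // per-lane B/G interleave
  bg = _mm256_permute4x64_epi64(bg, 0xd8);         // restore pixel order
  __m256i ra = _mm256_unpacklo_epi8(r, a);         // per-lane R/A interleave
  ra = _mm256_permute4x64_epi64(ra, 0xd8);
  __m256i lo = _mm256_unpacklo_epi16(bg, ra);      // pixels 0..7
  __m256i hi = _mm256_unpackhi_epi16(bg, ra);      // pixels 8..15
  _mm256_storeu_si256((__m256i*)dst, lo);
  _mm256_storeu_si256((__m256i*)(dst + 32), hi);
}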
2028 // Store 16 BGRA values. |
2029 #define STOREBGRA_AVX2 __asm { \ | |
2030 __asm vpunpcklbw ymm1, ymm1, ymm0 /* GB */ \ | |
2031 __asm vpermq ymm1, ymm1, 0xd8 \ | |
2032 __asm vpunpcklbw ymm2, ymm5, ymm2 /* AR */ \ | |
2033 __asm vpermq ymm2, ymm2, 0xd8 \ | |
2034 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ARGB first 8 pixels */ \ | |
2035 __asm vpunpckhwd ymm2, ymm2, ymm1 /* ARGB next 8 pixels */ \ | |
2036 __asm vmovdqu [edx], ymm0 \ | |
2037 __asm vmovdqu [edx + 32], ymm2 \ | |
2038 __asm lea edx, [edx + 64] \ | |
2039 } | |
2040 | |
2041 // Store 16 RGBA values. | 2017 // Store 16 RGBA values. |
2042 #define STORERGBA_AVX2 __asm { \ | 2018 #define STORERGBA_AVX2 __asm { \ |
2043 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ | 2019 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ |
2044 __asm vpermq ymm1, ymm1, 0xd8 \ | 2020 __asm vpermq ymm1, ymm1, 0xd8 \ |
2045 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ | 2021 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ |
2046 __asm vpermq ymm2, ymm2, 0xd8 \ | 2022 __asm vpermq ymm2, ymm2, 0xd8 \ |
2047 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ | 2023 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ |
2048 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ | 2024 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ |
2049 __asm vmovdqu [edx], ymm0 \ | 2025 __asm vmovdqu [edx], ymm0 \ |
2050 __asm vmovdqu [edx + 32], ymm1 \ | 2026 __asm vmovdqu [edx + 32], ymm1 \ |
2051 __asm lea edx, [edx + 64] \ | 2027 __asm lea edx, [edx + 64] \ |
2052 } | 2028 } |
2053 | 2029 |
2054 // Store 16 ABGR values. | |
2055 #define STOREABGR_AVX2 __asm { \ | |
2056 __asm vpunpcklbw ymm1, ymm2, ymm1 /* RG */ \ | |
2057 __asm vpermq ymm1, ymm1, 0xd8 \ | |
2058 __asm vpunpcklbw ymm2, ymm0, ymm5 /* BA */ \ | |
2059 __asm vpermq ymm2, ymm2, 0xd8 \ | |
2060 __asm vpunpcklwd ymm0, ymm1, ymm2 /* RGBA first 8 pixels */ \ | |
2061 __asm vpunpckhwd ymm1, ymm1, ymm2 /* RGBA next 8 pixels */ \ | |
2062 __asm vmovdqu [edx], ymm0 \ | |
2063 __asm vmovdqu [edx + 32], ymm1 \ | |
2064 __asm lea edx, [edx + 64] \ | |
2065 } | |
2066 | |
2067 #ifdef HAS_I422TOARGBROW_AVX2 | 2030 #ifdef HAS_I422TOARGBROW_AVX2 |
2068 // 16 pixels | 2031 // 16 pixels |
2069 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2032 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2070 __declspec(naked) | 2033 __declspec(naked) |
2071 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2034 void I422ToARGBRow_AVX2(const uint8* y_buf, |
2072 const uint8* u_buf, | 2035 const uint8* u_buf, |
2073 const uint8* v_buf, | 2036 const uint8* v_buf, |
2074 uint8* dst_argb, | 2037 uint8* dst_argb, |
2075 const struct YuvConstants* yuvconstants, | 2038 const struct YuvConstants* yuvconstants, |
2076 int width) { | 2039 int width) { |
(...skipping 453 matching lines...)
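The body of I422ToARGBRow_AVX2 is collapsed above. For reference, a scalar sketch of the per-pixel math these row functions vectorize, assuming fixed BT.601 studio-swing constants; the real rows read their coefficients from the yuvconstants argument, and in I422 each U/V sample is shared by two adjacent pixels. The helper names clamp_u8 and yuv_to_argb_pixel are illustrative.

#include <stdint.h>

static uint8_t clamp_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

// One pixel of YUV -> ARGB (B,G,R,A byte order), BT.601 constants assumed.
static void yuv_to_argb_pixel(uint8_t y, uint8_t u, uint8_t v, uint8_t* argb) {
  int c = ((int)y - 16) * 298;
  int d = (int)u - 128;
  int e = (int)v - 128;
  argb[0] = clamp_u8((c + 516 * d + 128) >> 8);            // B
  argb[1] = clamp_u8((c - 100 * d - 208 * e + 128) >> 8);  // G
  argb[2] = clamp_u8((c + 409 * e + 128) >> 8);            // R
  argb[3] = 255;                                           // A
}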
2530 __asm punpcklbw xmm1, xmm0 /* GB */ \ | 2493 __asm punpcklbw xmm1, xmm0 /* GB */ \ |
2531 __asm punpcklbw xmm5, xmm2 /* AR */ \ | 2494 __asm punpcklbw xmm5, xmm2 /* AR */ \ |
2532 __asm movdqa xmm0, xmm5 \ | 2495 __asm movdqa xmm0, xmm5 \ |
2533 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ | 2496 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ |
2534 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ | 2497 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ |
2535 __asm movdqu 0[edx], xmm5 \ | 2498 __asm movdqu 0[edx], xmm5 \ |
2536 __asm movdqu 16[edx], xmm0 \ | 2499 __asm movdqu 16[edx], xmm0 \ |
2537 __asm lea edx, [edx + 32] \ | 2500 __asm lea edx, [edx + 32] \ |
2538 } | 2501 } |
2539 | 2502 |
2540 // Store 8 ABGR values. | |
2541 #define STOREABGR __asm { \ | |
2542 __asm punpcklbw xmm2, xmm1 /* RG */ \ | |
2543 __asm punpcklbw xmm0, xmm5 /* BA */ \ | |
2544 __asm movdqa xmm1, xmm2 \ | |
2545 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ | |
2546 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ | |
2547 __asm movdqu 0[edx], xmm2 \ | |
2548 __asm movdqu 16[edx], xmm1 \ | |
2549 __asm lea edx, [edx + 32] \ | |
2550 } | |
2551 | |
2552 // Store 8 RGBA values. | 2503 // Store 8 RGBA values. |
2553 #define STORERGBA __asm { \ | 2504 #define STORERGBA __asm { \ |
2554 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ | 2505 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ |
2555 __asm punpcklbw xmm1, xmm2 /* GR */ \ | 2506 __asm punpcklbw xmm1, xmm2 /* GR */ \ |
2556 __asm punpcklbw xmm5, xmm0 /* AB */ \ | 2507 __asm punpcklbw xmm5, xmm0 /* AB */ \ |
2557 __asm movdqa xmm0, xmm5 \ | 2508 __asm movdqa xmm0, xmm5 \ |
2558 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ | 2509 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ |
2559 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ | 2510 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ |
2560 __asm movdqu 0[edx], xmm5 \ | 2511 __asm movdqu 0[edx], xmm5 \ |
2561 __asm movdqu 16[edx], xmm0 \ | 2512 __asm movdqu 16[edx], xmm0 \ |
(...skipping 10 matching lines...)
2572 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | 2523 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
2573 /* RRGB -> RGB24 */ \ | 2524 /* RRGB -> RGB24 */ \ |
2574 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ | 2525 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
2575 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ | 2526 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
2576 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ | 2527 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
2577 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ | 2528 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
2578 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ | 2529 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
2579 __asm lea edx, [edx + 24] \ | 2530 __asm lea edx, [edx + 24] \ |
2580 } | 2531 } |
2581 | 2532 |
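The STORERGB24 macro above first weaves B, G and a duplicated R into 4-byte groups, then uses two pshufb shuffles plus a palignr to drop the filler byte and stitch 24 contiguous bytes. A hypothetical scalar equivalent, with illustrative names only:

#include <stdint.h>
#include <string.h>

// Keep the three color bytes of each of 8 four-byte pixels, drop the
// fourth (filler) byte, and write 24 contiguous RGB24 bytes.
static void pack_bgrx_to_rgb24(const uint8_t bgrx[32], uint8_t* dst) {
  for (int i = 0; i < 8; ++i) {
    memcpy(dst + i * 3, bgrx + i * 4, 3);
  }
}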
2582 // Store 8 RAW values. | |
2583 #define STORERAW __asm { \ | |
2584 /* Weave into RRGB */ \ | |
2585 __asm punpcklbw xmm0, xmm1 /* BG */ \ | |
2586 __asm punpcklbw xmm2, xmm2 /* RR */ \ | |
2587 __asm movdqa xmm1, xmm0 \ | |
2588 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | |
2589 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | |
2590 /* Step 4: RRGB -> RAW */ \ | |
2591 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ | |
2592 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ | |
2593 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ | |
2594 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ | |
2595 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ | |
2596 __asm lea edx, [edx + 24] \ | |
2597 } | |
2598 | |
2599 // Store 8 RGB565 values. | 2533 // Store 8 RGB565 values. |
2600 #define STORERGB565 __asm { \ | 2534 #define STORERGB565 __asm { \ |
2601 /* Weave into RRGB */ \ | 2535 /* Weave into RRGB */ \ |
2602 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2536 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
2603 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2537 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
2604 __asm movdqa xmm1, xmm0 \ | 2538 __asm movdqa xmm1, xmm0 \ |
2605 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2539 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
2606 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | 2540 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
2607 /* RRGB -> RGB565 */ \ | 2541 /* RRGB -> RGB565 */ \ |
2608 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ | 2542 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ |
(...skipping 3644 matching lines...)
6253 } | 6187 } |
6254 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6188 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6255 | 6189 |
6256 #endif // defined(_M_X64) | 6190 #endif // defined(_M_X64) |
6257 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6191 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6258 | 6192 |
6259 #ifdef __cplusplus | 6193 #ifdef __cplusplus |
6260 } // extern "C" | 6194 } // extern "C" |
6261 } // namespace libyuv | 6195 } // namespace libyuv |
6262 #endif | 6196 #endif |