| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 77 #define STOREARGB \ | 77 #define STOREARGB \ |
| 78 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ | 78 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
| 79 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ | 79 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ |
| 80 xmm1 = _mm_loadu_si128(&xmm0); \ | 80 xmm1 = _mm_loadu_si128(&xmm0); \ |
| 81 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ | 81 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ |
| 82 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ | 82 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ |
| 83 _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ | 83 _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ |
| 84 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ | 84 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ |
| 85 dst_argb += 32; | 85 dst_argb += 32; |
| 86 | 86 |
| 87 // Store 8 ABGR values. | |
| 88 #define STOREABGR \ | |
| 89 xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ | |
| 90 xmm0 = _mm_unpacklo_epi8(xmm0, xmm5); \ | |
| 91 xmm1 = _mm_loadu_si128(&xmm2); \ | |
| 92 xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \ | |
| 93 xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \ | |
| 94 _mm_storeu_si128((__m128i *)dst_abgr, xmm2); \ | |
| 95 _mm_storeu_si128((__m128i *)(dst_abgr + 16), xmm1); \ | |
| 96 dst_abgr += 32; | |
| 97 | |
| 98 | 87 |
| 99 #if defined(HAS_I422TOARGBROW_SSSE3) | 88 #if defined(HAS_I422TOARGBROW_SSSE3) |
| 100 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 89 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 101 const uint8* u_buf, | 90 const uint8* u_buf, |
| 102 const uint8* v_buf, | 91 const uint8* v_buf, |
| 103 uint8* dst_argb, | 92 uint8* dst_argb, |
| 104 const struct YuvConstants* yuvconstants, | 93 const struct YuvConstants* yuvconstants, |
| 105 int width) { | 94 int width) { |
| 106 __m128i xmm0, xmm1, xmm2, xmm4; | 95 __m128i xmm0, xmm1, xmm2, xmm4; |
| 107 const __m128i xmm5 = _mm_set1_epi8(-1); | 96 const __m128i xmm5 = _mm_set1_epi8(-1); |
| (...skipping 1910 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2018 __asm vpermq ymm0, ymm0, 0xd8 \ | 2007 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 2019 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ | 2008 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ |
| 2020 __asm vpermq ymm2, ymm2, 0xd8 \ | 2009 __asm vpermq ymm2, ymm2, 0xd8 \ |
| 2021 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ | 2010 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ |
| 2022 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ | 2011 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ |
| 2023 __asm vmovdqu 0[edx], ymm1 \ | 2012 __asm vmovdqu 0[edx], ymm1 \ |
| 2024 __asm vmovdqu 32[edx], ymm0 \ | 2013 __asm vmovdqu 32[edx], ymm0 \ |
| 2025 __asm lea edx, [edx + 64] \ | 2014 __asm lea edx, [edx + 64] \ |
| 2026 } | 2015 } |
| 2027 | 2016 |
| 2028 // Store 16 ABGR values. | |
| 2029 #define STOREBGRA_AVX2 __asm { \ | |
| 2030 __asm vpunpcklbw ymm1, ymm1, ymm0 /* GB */ \ | |
| 2031 __asm vpermq ymm1, ymm1, 0xd8 \ | |
| 2032 __asm vpunpcklbw ymm2, ymm5, ymm2 /* AR */ \ | |
| 2033 __asm vpermq ymm2, ymm2, 0xd8 \ | |
| 2034 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ARGB first 8 pixels */ \ | |
| 2035 __asm vpunpckhwd ymm2, ymm2, ymm1 /* ARGB next 8 pixels */ \ | |
| 2036 __asm vmovdqu [edx], ymm0 \ | |
| 2037 __asm vmovdqu [edx + 32], ymm2 \ | |
| 2038 __asm lea edx, [edx + 64] \ | |
| 2039 } | |
| 2040 | |
| 2041 // Store 16 RGBA values. | 2017 // Store 16 RGBA values. |
| 2042 #define STORERGBA_AVX2 __asm { \ | 2018 #define STORERGBA_AVX2 __asm { \ |
| 2043 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ | 2019 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ |
| 2044 __asm vpermq ymm1, ymm1, 0xd8 \ | 2020 __asm vpermq ymm1, ymm1, 0xd8 \ |
| 2045 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ | 2021 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ |
| 2046 __asm vpermq ymm2, ymm2, 0xd8 \ | 2022 __asm vpermq ymm2, ymm2, 0xd8 \ |
| 2047 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ | 2023 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ |
| 2048 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ | 2024 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ |
| 2049 __asm vmovdqu [edx], ymm0 \ | 2025 __asm vmovdqu [edx], ymm0 \ |
| 2050 __asm vmovdqu [edx + 32], ymm1 \ | 2026 __asm vmovdqu [edx + 32], ymm1 \ |
| 2051 __asm lea edx, [edx + 64] \ | 2027 __asm lea edx, [edx + 64] \ |
| 2052 } | 2028 } |
| 2053 | 2029 |
| 2054 // Store 16 ABGR values. | |
| 2055 #define STOREABGR_AVX2 __asm { \ | |
| 2056 __asm vpunpcklbw ymm1, ymm2, ymm1 /* RG */ \ | |
| 2057 __asm vpermq ymm1, ymm1, 0xd8 \ | |
| 2058 __asm vpunpcklbw ymm2, ymm0, ymm5 /* BA */ \ | |
| 2059 __asm vpermq ymm2, ymm2, 0xd8 \ | |
| 2060 __asm vpunpcklwd ymm0, ymm1, ymm2 /* RGBA first 8 pixels */ \ | |
| 2061 __asm vpunpckhwd ymm1, ymm1, ymm2 /* RGBA next 8 pixels */ \ | |
| 2062 __asm vmovdqu [edx], ymm0 \ | |
| 2063 __asm vmovdqu [edx + 32], ymm1 \ | |
| 2064 __asm lea edx, [edx + 64] \ | |
| 2065 } | |
| 2066 | |
| 2067 #ifdef HAS_I422TOARGBROW_AVX2 | 2030 #ifdef HAS_I422TOARGBROW_AVX2 |
| 2068 // 16 pixels | 2031 // 16 pixels |
| 2069 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2032 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2070 __declspec(naked) | 2033 __declspec(naked) |
| 2071 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2034 void I422ToARGBRow_AVX2(const uint8* y_buf, |
| 2072 const uint8* u_buf, | 2035 const uint8* u_buf, |
| 2073 const uint8* v_buf, | 2036 const uint8* v_buf, |
| 2074 uint8* dst_argb, | 2037 uint8* dst_argb, |
| 2075 const struct YuvConstants* yuvconstants, | 2038 const struct YuvConstants* yuvconstants, |
| 2076 int width) { | 2039 int width) { |
| (...skipping 453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2530 __asm punpcklbw xmm1, xmm0 /* GB */ \ | 2493 __asm punpcklbw xmm1, xmm0 /* GB */ \ |
| 2531 __asm punpcklbw xmm5, xmm2 /* AR */ \ | 2494 __asm punpcklbw xmm5, xmm2 /* AR */ \ |
| 2532 __asm movdqa xmm0, xmm5 \ | 2495 __asm movdqa xmm0, xmm5 \ |
| 2533 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ | 2496 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ |
| 2534 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ | 2497 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ |
| 2535 __asm movdqu 0[edx], xmm5 \ | 2498 __asm movdqu 0[edx], xmm5 \ |
| 2536 __asm movdqu 16[edx], xmm0 \ | 2499 __asm movdqu 16[edx], xmm0 \ |
| 2537 __asm lea edx, [edx + 32] \ | 2500 __asm lea edx, [edx + 32] \ |
| 2538 } | 2501 } |
| 2539 | 2502 |
| 2540 // Store 8 ABGR values. | |
| 2541 #define STOREABGR __asm { \ | |
| 2542 __asm punpcklbw xmm2, xmm1 /* RG */ \ | |
| 2543 __asm punpcklbw xmm0, xmm5 /* BA */ \ | |
| 2544 __asm movdqa xmm1, xmm2 \ | |
| 2545 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ | |
| 2546 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ | |
| 2547 __asm movdqu 0[edx], xmm2 \ | |
| 2548 __asm movdqu 16[edx], xmm1 \ | |
| 2549 __asm lea edx, [edx + 32] \ | |
| 2550 } | |
| 2551 | |
| 2552 // Store 8 RGBA values. | 2503 // Store 8 RGBA values. |
| 2553 #define STORERGBA __asm { \ | 2504 #define STORERGBA __asm { \ |
| 2554 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ | 2505 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ |
| 2555 __asm punpcklbw xmm1, xmm2 /* GR */ \ | 2506 __asm punpcklbw xmm1, xmm2 /* GR */ \ |
| 2556 __asm punpcklbw xmm5, xmm0 /* AB */ \ | 2507 __asm punpcklbw xmm5, xmm0 /* AB */ \ |
| 2557 __asm movdqa xmm0, xmm5 \ | 2508 __asm movdqa xmm0, xmm5 \ |
| 2558 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ | 2509 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ |
| 2559 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ | 2510 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ |
| 2560 __asm movdqu 0[edx], xmm5 \ | 2511 __asm movdqu 0[edx], xmm5 \ |
| 2561 __asm movdqu 16[edx], xmm0 \ | 2512 __asm movdqu 16[edx], xmm0 \ |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2572 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | 2523 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
| 2573 /* RRGB -> RGB24 */ \ | 2524 /* RRGB -> RGB24 */ \ |
| 2574 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ | 2525 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ |
| 2575 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ | 2526 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ |
| 2576 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ | 2527 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ |
| 2577 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ | 2528 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ |
| 2578 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ | 2529 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ |
| 2579 __asm lea edx, [edx + 24] \ | 2530 __asm lea edx, [edx + 24] \ |
| 2580 } | 2531 } |
| 2581 | 2532 |
| 2582 // Store 8 RAW values. | |
| 2583 #define STORERAW __asm { \ | |
| 2584 /* Weave into RRGB */ \ | |
| 2585 __asm punpcklbw xmm0, xmm1 /* BG */ \ | |
| 2586 __asm punpcklbw xmm2, xmm2 /* RR */ \ | |
| 2587 __asm movdqa xmm1, xmm0 \ | |
| 2588 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | |
| 2589 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | |
| 2590 /* Step 4: RRGB -> RAW */ \ | |
| 2591 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ | |
| 2592 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ | |
| 2593 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ | |
| 2594 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ | |
| 2595 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ | |
| 2596 __asm lea edx, [edx + 24] \ | |
| 2597 } | |
| 2598 | |
| 2599 // Store 8 RGB565 values. | 2533 // Store 8 RGB565 values. |
| 2600 #define STORERGB565 __asm { \ | 2534 #define STORERGB565 __asm { \ |
| 2601 /* Weave into RRGB */ \ | 2535 /* Weave into RRGB */ \ |
| 2602 __asm punpcklbw xmm0, xmm1 /* BG */ \ | 2536 __asm punpcklbw xmm0, xmm1 /* BG */ \ |
| 2603 __asm punpcklbw xmm2, xmm2 /* RR */ \ | 2537 __asm punpcklbw xmm2, xmm2 /* RR */ \ |
| 2604 __asm movdqa xmm1, xmm0 \ | 2538 __asm movdqa xmm1, xmm0 \ |
| 2605 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ | 2539 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ |
| 2606 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ | 2540 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ |
| 2607 /* RRGB -> RGB565 */ \ | 2541 /* RRGB -> RGB565 */ \ |
| 2608 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ | 2542 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ |
| (...skipping 3644 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6253 } | 6187 } |
| 6254 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6188 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6255 | 6189 |
| 6256 #endif // defined(_M_X64) | 6190 #endif // defined(_M_X64) |
| 6257 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6191 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6258 | 6192 |
| 6259 #ifdef __cplusplus | 6193 #ifdef __cplusplus |
| 6260 } // extern "C" | 6194 } // extern "C" |
| 6261 } // namespace libyuv | 6195 } // namespace libyuv |
| 6262 #endif | 6196 #endif |
| OLD | NEW |