OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 | 8 |
9 #include "SkBlitRow_opts_SSE2.h" | 9 #include "SkBlitRow_opts_SSE2.h" |
10 #include "SkBitmapProcState_opts_SSE2.h" | 10 #include "SkBitmapProcState_opts_SSE2.h" |
11 #include "SkColorPriv.h" | 11 #include "SkColorPriv.h" |
| 12 #include "SkColor_opts_SSE2.h" |
12 #include "SkUtils.h" | 13 #include "SkUtils.h" |
13 | 14 |
14 #include <emmintrin.h> | 15 #include <emmintrin.h> |
15 | 16 |
16 /* SSE2 version of S32_Blend_BlitRow32() | 17 /* SSE2 version of S32_Blend_BlitRow32() |
17 * portable version is in core/SkBlitRow_D32.cpp | 18 * portable version is in core/SkBlitRow_D32.cpp |
18 */ | 19 */ |
19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, | 20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
20 const SkPMColor* SK_RESTRICT src, | 21 const SkPMColor* SK_RESTRICT src, |
21 int count, U8CPU alpha) { | 22 int count, U8CPU alpha) { |
(...skipping 822 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
844 dst = reinterpret_cast<SkPMColor*>(d); | 845 dst = reinterpret_cast<SkPMColor*>(d); |
845 } | 846 } |
846 | 847 |
847 while (width > 0) { | 848 while (width > 0) { |
848 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); | 849 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
849 mask++; | 850 mask++; |
850 dst++; | 851 dst++; |
851 width--; | 852 width--; |
852 } | 853 } |
853 } | 854 } |
| 855 |
| 856 /* SSE2 version of S32A_D565_Opaque() |
| 857 * portable version is in core/SkBlitRow_D16.cpp |
| 858 */ |
| 859 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, |
| 860 const SkPMColor* SK_RESTRICT src, |
| 861 int count, U8CPU alpha, int /*x*/, int /*y*/) { |
| 862 SkASSERT(255 == alpha); |
| 863 |
| 864 if (count <= 0) { |
| 865 return; |
| 866 } |
| 867 |
| 868 if (count >= 8) { |
| 869 // Make dst 16 bytes alignment |
| 870 while (((size_t)dst & 0x0F) != 0) { |
| 871 SkPMColor c = *src++; |
| 872 if (c) { |
| 873 *dst = SkSrcOver32To16(c, *dst); |
| 874 } |
| 875 dst += 1; |
| 876 count--; |
| 877 } |
| 878 |
| 879 const __m128i* s = reinterpret_cast<const __m128i*>(src); |
| 880 __m128i* d = reinterpret_cast<__m128i*>(dst); |
| 881 __m128i var255 = _mm_set1_epi16(255); |
| 882 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); |
| 883 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); |
| 884 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); |
| 885 |
| 886 while (count >= 8) { |
| 887 // Load 8 pixels of src. |
| 888 __m128i src_pixel1 = _mm_loadu_si128(s++); |
| 889 __m128i src_pixel2 = _mm_loadu_si128(s++); |
| 890 |
| 891 // Check whether src pixels are equal to 0 and get the highest bit |
| 892 // of each byte of result, if src pixels are all zero, src_cmp1 and |
| 893 // src_cmp2 will be 0xFFFF. |
| 894 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, |
| 895 _mm_setzero_si128())); |
| 896 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, |
| 897 _mm_setzero_si128())); |
| 898 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { |
| 899 d++; |
| 900 count -= 8; |
| 901 continue; |
| 902 } |
| 903 |
| 904 // Load 8 pixels of dst. |
| 905 __m128i dst_pixel = _mm_load_si128(d); |
| 906 |
| 907 // Extract A from src. |
| 908 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); |
| 909 sa1 = _mm_srli_epi32(sa1, 24); |
| 910 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); |
| 911 sa2 = _mm_srli_epi32(sa2, 24); |
| 912 __m128i sa = _mm_packs_epi32(sa1, sa2); |
| 913 |
| 914 // Extract R from src. |
| 915 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); |
| 916 sr1 = _mm_srli_epi32(sr1, 24); |
| 917 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); |
| 918 sr2 = _mm_srli_epi32(sr2, 24); |
| 919 __m128i sr = _mm_packs_epi32(sr1, sr2); |
| 920 |
| 921 // Extract G from src. |
| 922 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); |
| 923 sg1 = _mm_srli_epi32(sg1, 24); |
| 924 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); |
| 925 sg2 = _mm_srli_epi32(sg2, 24); |
| 926 __m128i sg = _mm_packs_epi32(sg1, sg2); |
| 927 |
| 928 // Extract B from src. |
| 929 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); |
| 930 sb1 = _mm_srli_epi32(sb1, 24); |
| 931 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); |
| 932 sb2 = _mm_srli_epi32(sb2, 24); |
| 933 __m128i sb = _mm_packs_epi32(sb1, sb2); |
| 934 |
| 935 // Extract R G B from dst. |
| 936 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); |
| 937 dr = _mm_and_si128(dr, r16_mask); |
| 938 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); |
| 939 dg = _mm_and_si128(dg, g16_mask); |
| 940 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); |
| 941 db = _mm_and_si128(db, b16_mask); |
| 942 |
| 943 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa |
| 944 |
| 945 // Calculate R G B of result. |
| 946 // Original algorithm is in SkSrcOver32To16(). |
| 947 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); |
| 948 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); |
| 949 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); |
| 950 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); |
| 951 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); |
| 952 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); |
| 953 |
| 954 // Pack R G B into 16-bit color. |
| 955 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); |
| 956 |
| 957 // Store 8 16-bit colors in dst. |
| 958 _mm_store_si128(d++, d_pixel); |
| 959 count -= 8; |
| 960 } |
| 961 |
| 962 src = reinterpret_cast<const SkPMColor*>(s); |
| 963 dst = reinterpret_cast<uint16_t*>(d); |
| 964 } |
| 965 |
| 966 if (count > 0) { |
| 967 do { |
| 968 SkPMColor c = *src++; |
| 969 SkPMColorAssert(c); |
| 970 if (c) { |
| 971 *dst = SkSrcOver32To16(c, *dst); |
| 972 } |
| 973 dst += 1; |
| 974 } while (--count != 0); |
| 975 } |
| 976 } |
OLD | NEW |