Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(134)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 202903004: Xfermode: SSE2 implementation of multiply_modeproc (Closed) Base URL: https://skia.googlesource.com/skia.git@xfermode
Patch Set: add SkGetPacked(A/R/G/B)32_SSE2 function Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 8
9 #include "SkBlitRow_opts_SSE2.h" 9 #include "SkBlitRow_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSE2.h" 10 #include "SkBitmapProcState_opts_SSE2.h"
(...skipping 895 matching lines...) Expand 10 before | Expand all | Expand 10 after
906 // Calculate result b. 906 // Calculate result b.
907 __m128i b1 = _mm_srli_epi32(src_pixel1, 907 __m128i b1 = _mm_srli_epi32(src_pixel1,
908 SK_B32_SHIFT + (8 - SK_B16_BITS)); 908 SK_B32_SHIFT + (8 - SK_B16_BITS));
909 b1 = _mm_and_si128(b1, b16_mask); 909 b1 = _mm_and_si128(b1, b16_mask);
910 __m128i b2 = _mm_srli_epi32(src_pixel2, 910 __m128i b2 = _mm_srli_epi32(src_pixel2,
911 SK_B32_SHIFT + (8 - SK_B16_BITS)); 911 SK_B32_SHIFT + (8 - SK_B16_BITS));
912 b2 = _mm_and_si128(b2, b16_mask); 912 b2 = _mm_and_si128(b2, b16_mask);
913 __m128i b = _mm_packs_epi32(b1, b2); 913 __m128i b = _mm_packs_epi32(b1, b2);
914 914
915 // Store 8 16-bit colors in dst. 915 // Store 8 16-bit colors in dst.
916 __m128i d_pixel = SkPackRGB16_SSE(r, g, b); 916 __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
917 _mm_store_si128(d++, d_pixel); 917 _mm_store_si128(d++, d_pixel);
918 count -= 8; 918 count -= 8;
919 } 919 }
920 src = reinterpret_cast<const SkPMColor*>(s); 920 src = reinterpret_cast<const SkPMColor*>(s);
921 dst = reinterpret_cast<uint16_t*>(d); 921 dst = reinterpret_cast<uint16_t*>(d);
922 } 922 }
923 923
924 if (count > 0) { 924 if (count > 0) {
925 do { 925 do {
926 SkPMColor c = *src++; 926 SkPMColor c = *src++;
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
975 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 975 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
976 d++; 976 d++;
977 count -= 8; 977 count -= 8;
978 continue; 978 continue;
979 } 979 }
980 980
981 // Load 8 pixels of dst. 981 // Load 8 pixels of dst.
982 __m128i dst_pixel = _mm_load_si128(d); 982 __m128i dst_pixel = _mm_load_si128(d);
983 983
984 // Extract A from src. 984 // Extract A from src.
985 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); 985 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
986 sa1 = _mm_srli_epi32(sa1, 24); 986 sa1 = _mm_srli_epi32(sa1, 24);
987 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); 987 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
988 sa2 = _mm_srli_epi32(sa2, 24); 988 sa2 = _mm_srli_epi32(sa2, 24);
989 __m128i sa = _mm_packs_epi32(sa1, sa2); 989 __m128i sa = _mm_packs_epi32(sa1, sa2);
990 990
991 // Extract R from src. 991 // Extract R from src.
992 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); 992 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
993 sr1 = _mm_srli_epi32(sr1, 24); 993 sr1 = _mm_srli_epi32(sr1, 24);
994 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); 994 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
995 sr2 = _mm_srli_epi32(sr2, 24); 995 sr2 = _mm_srli_epi32(sr2, 24);
996 __m128i sr = _mm_packs_epi32(sr1, sr2); 996 __m128i sr = _mm_packs_epi32(sr1, sr2);
997 997
998 // Extract G from src. 998 // Extract G from src.
999 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); 999 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1000 sg1 = _mm_srli_epi32(sg1, 24); 1000 sg1 = _mm_srli_epi32(sg1, 24);
1001 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); 1001 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1002 sg2 = _mm_srli_epi32(sg2, 24); 1002 sg2 = _mm_srli_epi32(sg2, 24);
1003 __m128i sg = _mm_packs_epi32(sg1, sg2); 1003 __m128i sg = _mm_packs_epi32(sg1, sg2);
1004 1004
1005 // Extract B from src. 1005 // Extract B from src.
1006 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); 1006 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1007 sb1 = _mm_srli_epi32(sb1, 24); 1007 sb1 = _mm_srli_epi32(sb1, 24);
1008 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); 1008 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1009 sb2 = _mm_srli_epi32(sb2, 24); 1009 sb2 = _mm_srli_epi32(sb2, 24);
1010 __m128i sb = _mm_packs_epi32(sb1, sb2); 1010 __m128i sb = _mm_packs_epi32(sb1, sb2);
1011 1011
1012 // Extract R G B from dst. 1012 // Extract R G B from dst.
1013 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); 1013 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1014 dr = _mm_and_si128(dr, r16_mask); 1014 dr = _mm_and_si128(dr, r16_mask);
1015 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); 1015 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1016 dg = _mm_and_si128(dg, g16_mask); 1016 dg = _mm_and_si128(dg, g16_mask);
1017 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); 1017 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1018 db = _mm_and_si128(db, b16_mask); 1018 db = _mm_and_si128(db, b16_mask);
1019 1019
1020 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 1020 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1021 1021
1022 // Calculate R G B of result. 1022 // Calculate R G B of result.
1023 // Original algorithm is in SkSrcOver32To16(). 1023 // Original algorithm is in SkSrcOver32To16().
1024 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); 1024 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS)) ;
1025 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 1025 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1026 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); 1026 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS)) ;
1027 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 1027 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1028 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); 1028 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS)) ;
1029 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 1029 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1030 1030
1031 // Pack R G B into 16-bit color. 1031 // Pack R G B into 16-bit color.
1032 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); 1032 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1033 1033
1034 // Store 8 16-bit colors in dst. 1034 // Store 8 16-bit colors in dst.
1035 _mm_store_si128(d++, d_pixel); 1035 _mm_store_si128(d++, d_pixel);
1036 count -= 8; 1036 count -= 8;
1037 } 1037 }
1038 1038
1039 src = reinterpret_cast<const SkPMColor*>(s); 1039 src = reinterpret_cast<const SkPMColor*>(s);
1040 dst = reinterpret_cast<uint16_t*>(d); 1040 dst = reinterpret_cast<uint16_t*>(d);
1041 } 1041 }
1042 1042
1043 if (count > 0) { 1043 if (count > 0) {
1044 do { 1044 do {
1045 SkPMColor c = *src++; 1045 SkPMColor c = *src++;
1046 SkPMColorAssert(c); 1046 SkPMColorAssert(c);
1047 if (c) { 1047 if (c) {
1048 *dst = SkSrcOver32To16(c, *dst); 1048 *dst = SkSrcOver32To16(c, *dst);
1049 } 1049 }
1050 dst += 1; 1050 dst += 1;
1051 } while (--count != 0); 1051 } while (--count != 0);
1052 } 1052 }
1053 } 1053 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698