| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 | 8 |
| 9 #include "SkBlitRow_opts_SSE2.h" | 9 #include "SkBlitRow_opts_SSE2.h" |
| 10 #include "SkBitmapProcState_opts_SSE2.h" | 10 #include "SkBitmapProcState_opts_SSE2.h" |
| (...skipping 896 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 907 // Calculate result b. | 907 // Calculate result b. |
| 908 __m128i b1 = _mm_srli_epi32(src_pixel1, | 908 __m128i b1 = _mm_srli_epi32(src_pixel1, |
| 909 SK_B32_SHIFT + (8 - SK_B16_BITS)); | 909 SK_B32_SHIFT + (8 - SK_B16_BITS)); |
| 910 b1 = _mm_and_si128(b1, b16_mask); | 910 b1 = _mm_and_si128(b1, b16_mask); |
| 911 __m128i b2 = _mm_srli_epi32(src_pixel2, | 911 __m128i b2 = _mm_srli_epi32(src_pixel2, |
| 912 SK_B32_SHIFT + (8 - SK_B16_BITS)); | 912 SK_B32_SHIFT + (8 - SK_B16_BITS)); |
| 913 b2 = _mm_and_si128(b2, b16_mask); | 913 b2 = _mm_and_si128(b2, b16_mask); |
| 914 __m128i b = _mm_packs_epi32(b1, b2); | 914 __m128i b = _mm_packs_epi32(b1, b2); |
| 915 | 915 |
| 916 // Store 8 16-bit colors in dst. | 916 // Store 8 16-bit colors in dst. |
| 917 __m128i d_pixel = SkPackRGB16_SSE(r, g, b); | 917 __m128i d_pixel = SkPackRGB16_SSE2(r, g, b); |
| 918 _mm_store_si128(d++, d_pixel); | 918 _mm_store_si128(d++, d_pixel); |
| 919 count -= 8; | 919 count -= 8; |
| 920 } | 920 } |
| 921 src = reinterpret_cast<const SkPMColor*>(s); | 921 src = reinterpret_cast<const SkPMColor*>(s); |
| 922 dst = reinterpret_cast<uint16_t*>(d); | 922 dst = reinterpret_cast<uint16_t*>(d); |
| 923 } | 923 } |
| 924 | 924 |
| 925 if (count > 0) { | 925 if (count > 0) { |
| 926 do { | 926 do { |
| 927 SkPMColor c = *src++; | 927 SkPMColor c = *src++; |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 976 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { | 976 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { |
| 977 d++; | 977 d++; |
| 978 count -= 8; | 978 count -= 8; |
| 979 continue; | 979 continue; |
| 980 } | 980 } |
| 981 | 981 |
| 982 // Load 8 pixels of dst. | 982 // Load 8 pixels of dst. |
| 983 __m128i dst_pixel = _mm_load_si128(d); | 983 __m128i dst_pixel = _mm_load_si128(d); |
| 984 | 984 |
| 985 // Extract A from src. | 985 // Extract A from src. |
| 986 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); | 986 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); |
| 987 sa1 = _mm_srli_epi32(sa1, 24); | 987 sa1 = _mm_srli_epi32(sa1, 24); |
| 988 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); | 988 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); |
| 989 sa2 = _mm_srli_epi32(sa2, 24); | 989 sa2 = _mm_srli_epi32(sa2, 24); |
| 990 __m128i sa = _mm_packs_epi32(sa1, sa2); | 990 __m128i sa = _mm_packs_epi32(sa1, sa2); |
| 991 | 991 |
| 992 // Extract R from src. | 992 // Extract R from src. |
| 993 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); | 993 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); |
| 994 sr1 = _mm_srli_epi32(sr1, 24); | 994 sr1 = _mm_srli_epi32(sr1, 24); |
| 995 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); | 995 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); |
| 996 sr2 = _mm_srli_epi32(sr2, 24); | 996 sr2 = _mm_srli_epi32(sr2, 24); |
| 997 __m128i sr = _mm_packs_epi32(sr1, sr2); | 997 __m128i sr = _mm_packs_epi32(sr1, sr2); |
| 998 | 998 |
| 999 // Extract G from src. | 999 // Extract G from src. |
| 1000 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); | 1000 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); |
| 1001 sg1 = _mm_srli_epi32(sg1, 24); | 1001 sg1 = _mm_srli_epi32(sg1, 24); |
| 1002 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); | 1002 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); |
| 1003 sg2 = _mm_srli_epi32(sg2, 24); | 1003 sg2 = _mm_srli_epi32(sg2, 24); |
| 1004 __m128i sg = _mm_packs_epi32(sg1, sg2); | 1004 __m128i sg = _mm_packs_epi32(sg1, sg2); |
| 1005 | 1005 |
| 1006 // Extract B from src. | 1006 // Extract B from src. |
| 1007 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); | 1007 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); |
| 1008 sb1 = _mm_srli_epi32(sb1, 24); | 1008 sb1 = _mm_srli_epi32(sb1, 24); |
| 1009 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); | 1009 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); |
| 1010 sb2 = _mm_srli_epi32(sb2, 24); | 1010 sb2 = _mm_srli_epi32(sb2, 24); |
| 1011 __m128i sb = _mm_packs_epi32(sb1, sb2); | 1011 __m128i sb = _mm_packs_epi32(sb1, sb2); |
| 1012 | 1012 |
| 1013 // Extract R G B from dst. | 1013 // Extract R G B from dst. |
| 1014 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); | 1014 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); |
| 1015 dr = _mm_and_si128(dr, r16_mask); | 1015 dr = _mm_and_si128(dr, r16_mask); |
| 1016 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); | 1016 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); |
| 1017 dg = _mm_and_si128(dg, g16_mask); | 1017 dg = _mm_and_si128(dg, g16_mask); |
| 1018 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); | 1018 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); |
| 1019 db = _mm_and_si128(db, b16_mask); | 1019 db = _mm_and_si128(db, b16_mask); |
| 1020 | 1020 |
| 1021 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa | 1021 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa |
| 1022 | 1022 |
| 1023 // Calculate R G B of result. | 1023 // Calculate R G B of result. |
| 1024 // Original algorithm is in SkSrcOver32To16(). | 1024 // Original algorithm is in SkSrcOver32To16(). |
| 1025 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); | 1025 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS))
; |
| 1026 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); | 1026 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); |
| 1027 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); | 1027 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS))
; |
| 1028 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); | 1028 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); |
| 1029 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); | 1029 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS))
; |
| 1030 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); | 1030 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); |
| 1031 | 1031 |
| 1032 // Pack R G B into 16-bit color. | 1032 // Pack R G B into 16-bit color. |
| 1033 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); | 1033 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); |
| 1034 | 1034 |
| 1035 // Store 8 16-bit colors in dst. | 1035 // Store 8 16-bit colors in dst. |
| 1036 _mm_store_si128(d++, d_pixel); | 1036 _mm_store_si128(d++, d_pixel); |
| 1037 count -= 8; | 1037 count -= 8; |
| 1038 } | 1038 } |
| 1039 | 1039 |
| 1040 src = reinterpret_cast<const SkPMColor*>(s); | 1040 src = reinterpret_cast<const SkPMColor*>(s); |
| 1041 dst = reinterpret_cast<uint16_t*>(d); | 1041 dst = reinterpret_cast<uint16_t*>(d); |
| 1042 } | 1042 } |
| 1043 | 1043 |
| (...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1136 sb2 = _mm_srli_epi32(sb2, 24); | 1136 sb2 = _mm_srli_epi32(sb2, 24); |
| 1137 __m128i sb = _mm_packs_epi32(sb1, sb2); | 1137 __m128i sb = _mm_packs_epi32(sb1, sb2); |
| 1138 | 1138 |
| 1139 // SkDITHER_B32To565(sb, dither) | 1139 // SkDITHER_B32To565(sb, dither) |
| 1140 __m128i sb_offset = _mm_srli_epi16(sb, 5); | 1140 __m128i sb_offset = _mm_srli_epi16(sb, 5); |
| 1141 sb = _mm_add_epi16(sb, dither); | 1141 sb = _mm_add_epi16(sb, dither); |
| 1142 sb = _mm_sub_epi16(sb, sb_offset); | 1142 sb = _mm_sub_epi16(sb, sb_offset); |
| 1143 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); | 1143 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); |
| 1144 | 1144 |
| 1145 // Pack and store 16-bit dst pixel. | 1145 // Pack and store 16-bit dst pixel. |
| 1146 __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb); | 1146 __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb); |
| 1147 _mm_store_si128(d++, d_pixel); | 1147 _mm_store_si128(d++, d_pixel); |
| 1148 | 1148 |
| 1149 count -= 8; | 1149 count -= 8; |
| 1150 x += 8; | 1150 x += 8; |
| 1151 } | 1151 } |
| 1152 | 1152 |
| 1153 src = reinterpret_cast<const SkPMColor*>(s); | 1153 src = reinterpret_cast<const SkPMColor*>(s); |
| 1154 dst = reinterpret_cast<uint16_t*>(d); | 1154 dst = reinterpret_cast<uint16_t*>(d); |
| 1155 } | 1155 } |
| 1156 | 1156 |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1235 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); | 1235 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); |
| 1236 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); | 1236 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); |
| 1237 | 1237 |
| 1238 while (count >= 8) { | 1238 while (count >= 8) { |
| 1239 // Load 8 pixels of src and dst. | 1239 // Load 8 pixels of src and dst. |
| 1240 __m128i src_pixel1 = _mm_loadu_si128(s++); | 1240 __m128i src_pixel1 = _mm_loadu_si128(s++); |
| 1241 __m128i src_pixel2 = _mm_loadu_si128(s++); | 1241 __m128i src_pixel2 = _mm_loadu_si128(s++); |
| 1242 __m128i dst_pixel = _mm_load_si128(d); | 1242 __m128i dst_pixel = _mm_load_si128(d); |
| 1243 | 1243 |
| 1244 // Extract A from src. | 1244 // Extract A from src. |
| 1245 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); | 1245 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); |
| 1246 sa1 = _mm_srli_epi32(sa1, 24); | 1246 sa1 = _mm_srli_epi32(sa1, 24); |
| 1247 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); | 1247 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); |
| 1248 sa2 = _mm_srli_epi32(sa2, 24); | 1248 sa2 = _mm_srli_epi32(sa2, 24); |
| 1249 __m128i sa = _mm_packs_epi32(sa1, sa2); | 1249 __m128i sa = _mm_packs_epi32(sa1, sa2); |
| 1250 | 1250 |
| 1251 // Calculate current dither value. | 1251 // Calculate current dither value. |
| 1252 dither_cur = _mm_mullo_epi16(dither, | 1252 dither_cur = _mm_mullo_epi16(dither, |
| 1253 _mm_add_epi16(sa, _mm_set1_epi16(1))); | 1253 _mm_add_epi16(sa, _mm_set1_epi16(1))); |
| 1254 dither_cur = _mm_srli_epi16(dither_cur, 8); | 1254 dither_cur = _mm_srli_epi16(dither_cur, 8); |
| 1255 | 1255 |
| 1256 // Extract R from src. | 1256 // Extract R from src. |
| 1257 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); | 1257 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); |
| (...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1316 | 1316 |
| 1317 dg = _mm_mullo_epi16(dg, isa); | 1317 dg = _mm_mullo_epi16(dg, isa); |
| 1318 dg = _mm_add_epi16(dg, sg); | 1318 dg = _mm_add_epi16(dg, sg); |
| 1319 dg = _mm_srli_epi16(dg, 5); | 1319 dg = _mm_srli_epi16(dg, 5); |
| 1320 | 1320 |
| 1321 db = _mm_mullo_epi16(db, isa); | 1321 db = _mm_mullo_epi16(db, isa); |
| 1322 db = _mm_add_epi16(db, sb); | 1322 db = _mm_add_epi16(db, sb); |
| 1323 db = _mm_srli_epi16(db, 5); | 1323 db = _mm_srli_epi16(db, 5); |
| 1324 | 1324 |
| 1325 // Package and store dst pixel. | 1325 // Package and store dst pixel. |
| 1326 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); | 1326 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); |
| 1327 _mm_store_si128(d++, d_pixel); | 1327 _mm_store_si128(d++, d_pixel); |
| 1328 | 1328 |
| 1329 count -= 8; | 1329 count -= 8; |
| 1330 x += 8; | 1330 x += 8; |
| 1331 } | 1331 } |
| 1332 | 1332 |
| 1333 src = reinterpret_cast<const SkPMColor*>(s); | 1333 src = reinterpret_cast<const SkPMColor*>(s); |
| 1334 dst = reinterpret_cast<uint16_t*>(d); | 1334 dst = reinterpret_cast<uint16_t*>(d); |
| 1335 } | 1335 } |
| 1336 | 1336 |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1355 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1355 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
| 1356 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1356 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
| 1357 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1357 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
| 1358 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1358 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
| 1359 } | 1359 } |
| 1360 dst += 1; | 1360 dst += 1; |
| 1361 DITHER_INC_X(x); | 1361 DITHER_INC_X(x); |
| 1362 } while (--count != 0); | 1362 } while (--count != 0); |
| 1363 } | 1363 } |
| 1364 } | 1364 } |
| OLD | NEW |