OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
9 #include "SkBitmapProcState_opts_SSE2.h" | 9 #include "SkBitmapProcState_opts_SSE2.h" |
10 #include "SkBlitRow_opts_SSE2.h" | 10 #include "SkBlitRow_opts_SSE2.h" |
(...skipping 20 matching lines...) Expand all Loading... |
31 SkASSERT(((size_t)dst & 0x03) == 0); | 31 SkASSERT(((size_t)dst & 0x03) == 0); |
32 while (((size_t)dst & 0x0F) != 0) { | 32 while (((size_t)dst & 0x0F) != 0) { |
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); | 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
34 src++; | 34 src++; |
35 dst++; | 35 dst++; |
36 count--; | 36 count--; |
37 } | 37 } |
38 | 38 |
39 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
40 __m128i *d = reinterpret_cast<__m128i*>(dst); | 40 __m128i *d = reinterpret_cast<__m128i*>(dst); |
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); | |
43 | 41 |
44 // Move scale factors to upper byte of word | |
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); | |
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); | |
47 while (count >= 4) { | 42 while (count >= 4) { |
48 // Load 4 pixels each of src and dest. | 43 // Load 4 pixels each of src and dest. |
49 __m128i src_pixel = _mm_loadu_si128(s); | 44 __m128i src_pixel = _mm_loadu_si128(s); |
50 __m128i dst_pixel = _mm_load_si128(d); | 45 __m128i dst_pixel = _mm_load_si128(d); |
51 | 46 |
52 // Interleave Atom port 0/1 operations based on the execution port | 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale); |
53 // constraints that multiply can only be executed on port 0 (while | 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale); |
54 // boolean operations can be executed on either port 0 or port 1) | |
55 // because GCC currently doesn't do a good job scheduling | |
56 // instructions based on these constraints. | |
57 | |
58 // Get red and blue pixels into lower byte of each word. | |
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) | |
60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
61 | |
62 // Multiply by scale. | |
63 // (4 x (0, rs.h, 0, bs.h)) | |
64 // where rs.h stands for the higher byte of r * scale, and | |
65 // bs.h the higher byte of b * scale. | |
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); | |
67 | |
68 // Get alpha and green pixels into higher byte of each word. | |
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) | |
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); | |
71 | |
72 // Multiply by scale. | |
73 // (4 x (as.h, as.l, gs.h, gs.l)) | |
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); | |
75 | |
76 // Clear the lower byte of the a*scale and g*scale results | |
77 // (4 x (as.h, 0, gs.h, 0)) | |
78 src_ag = _mm_and_si128(src_ag, ag_mask); | |
79 | |
80 // Operations the destination pixels are the same as on the | |
81 // source pixels. See the comments above. | |
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); | |
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); | |
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); | |
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); | |
86 dst_ag = _mm_and_si128(dst_ag, ag_mask); | |
87 | |
88 // Combine back into RGBA. | |
89 // (4 x (as.h, rs.h, gs.h, bs.h)) | |
90 src_pixel = _mm_or_si128(src_rb, src_ag); | |
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag); | |
92 | 49 |
93 // Add result | 50 // Add result |
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); | 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
95 _mm_store_si128(d, result); | 52 _mm_store_si128(d, result); |
96 s++; | 53 s++; |
97 d++; | 54 d++; |
98 count -= 4; | 55 count -= 4; |
99 } | 56 } |
100 src = reinterpret_cast<const SkPMColor*>(s); | 57 src = reinterpret_cast<const SkPMColor*>(s); |
101 dst = reinterpret_cast<SkPMColor*>(d); | 58 dst = reinterpret_cast<SkPMColor*>(d); |
(...skipping 259 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
361 SkASSERT(((size_t)dst & 0x03) == 0); | 318 SkASSERT(((size_t)dst & 0x03) == 0); |
362 while (((size_t)dst & 0x0F) != 0) { | 319 while (((size_t)dst & 0x0F) != 0) { |
363 *dst = color + SkAlphaMulQ(*src, scale); | 320 *dst = color + SkAlphaMulQ(*src, scale); |
364 src++; | 321 src++; |
365 dst++; | 322 dst++; |
366 count--; | 323 count--; |
367 } | 324 } |
368 | 325 |
369 const __m128i *s = reinterpret_cast<const __m128i*>(src); | 326 const __m128i *s = reinterpret_cast<const __m128i*>(src); |
370 __m128i *d = reinterpret_cast<__m128i*>(dst); | 327 __m128i *d = reinterpret_cast<__m128i*>(dst); |
371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); | |
372 __m128i src_scale_wide = _mm_set1_epi16(scale); | |
373 __m128i color_wide = _mm_set1_epi32(color); | 328 __m128i color_wide = _mm_set1_epi32(color); |
374 while (count >= 4) { | 329 while (count >= 4) { |
375 // Load 4 pixels each of src and dest. | |
376 __m128i src_pixel = _mm_loadu_si128(s); | 330 __m128i src_pixel = _mm_loadu_si128(s); |
| 331 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale); |
377 | 332 |
378 // Get red and blue pixels into lower byte of each word. | |
379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); | |
380 | |
381 // Get alpha and green into lower byte of each word. | |
382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); | |
383 | |
384 // Multiply by scale. | |
385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); | |
386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); | |
387 | |
388 // Divide by 256. | |
389 src_rb = _mm_srli_epi16(src_rb, 8); | |
390 src_ag = _mm_andnot_si128(rb_mask, src_ag); | |
391 | |
392 // Combine back into RGBA. | |
393 src_pixel = _mm_or_si128(src_rb, src_ag); | |
394 | |
395 // Add color to result. | |
396 __m128i result = _mm_add_epi8(color_wide, src_pixel); | 333 __m128i result = _mm_add_epi8(color_wide, src_pixel); |
397 | |
398 // Store result. | |
399 _mm_store_si128(d, result); | 334 _mm_store_si128(d, result); |
400 s++; | 335 s++; |
401 d++; | 336 d++; |
402 count -= 4; | 337 count -= 4; |
403 } | 338 } |
404 src = reinterpret_cast<const SkPMColor*>(s); | 339 src = reinterpret_cast<const SkPMColor*>(s); |
405 dst = reinterpret_cast<SkPMColor*>(d); | 340 dst = reinterpret_cast<SkPMColor*>(d); |
406 } | 341 } |
407 | 342 |
408 while (count > 0) { | 343 while (count > 0) { |
(...skipping 942 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst); | 1286 uint32_t dst_expanded = SkExpand_rgb_16(*dst); |
1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); | 1287 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); |
1353 // now src and dst expanded are in g:11 r:10 x:1 b:10 | 1288 // now src and dst expanded are in g:11 r:10 x:1 b:10 |
1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); | 1289 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); |
1355 } | 1290 } |
1356 dst += 1; | 1291 dst += 1; |
1357 DITHER_INC_X(x); | 1292 DITHER_INC_X(x); |
1358 } while (--count != 0); | 1293 } while (--count != 0); |
1359 } | 1294 } |
1360 } | 1295 } |
OLD | NEW |