Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(134)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE4.cpp

Issue 923523002: Replace SSE optimization of Color32A_D565 (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Fixed comment comment Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/SkBlitRow_opts_SSE4.h ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #include "SkBlitRow_opts_SSE4.h" 1 #include "SkBlitRow_opts_SSE4.h"
2 2
3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. 3 // Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods.
4 // The stubs should never be called, so we make them crash just to confirm that. 4 // The stubs should never be called, so we make them crash just to confirm that.
5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 5 #if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { 6 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
7 sk_throw(); 7 sk_throw();
8 } 8 }
9 9
10 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
11 sk_throw();
12 }
13
14 #else 10 #else
15 11
16 #include <smmintrin.h> // SSE4.1 intrinsics 12 #include <smmintrin.h> // SSE4.1 intrinsics
17
18 #include "SkColorPriv.h" 13 #include "SkColorPriv.h"
19 #include "SkColor_opts_SSE2.h" 14 #include "SkColor_opts_SSE2.h"
20 15
21 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, 16 void S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src, 17 const SkPMColor* SK_RESTRICT src,
23 int count, 18 int count,
24 U8CPU alpha) { 19 U8CPU alpha) {
25 SkASSERT(alpha == 255); 20 SkASSERT(alpha == 255);
26 // As long as we can, we'll work on 16 pixel pairs at once. 21 // As long as we can, we'll work on 16 pixel pairs at once.
27 int count16 = count / 16; 22 int count16 = count / 16;
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
59 54
60 // Wrap up the last <= 15 pixels. 55 // Wrap up the last <= 15 pixels.
61 for (int i = count16*16; i < count; i++) { 56 for (int i = count16*16; i < count; i++) {
62 // This check is not really necessarily, but it prevents pointless autovectorization. 57 // This check is not really necessarily, but it prevents pointless autovectorization.
63 if (src[i] & 0xFF000000) { 58 if (src[i] & 0xFF000000) {
64 dst[i] = SkPMSrcOver(src[i], dst[i]); 59 dst[i] = SkPMSrcOver(src[i], dst[i]);
65 } 60 }
66 } 61 }
67 } 62 }
68 63
69 static inline uint16_t Color32A_D565_1x(uint16_t dst, unsigned scale, uint32_t src_expand) {
70 uint32_t dst_expand = SkExpand_rgb_16(dst) * scale;
71 return SkCompact_rgb_16((src_expand + dst_expand) >> 5);
72 }
73
74 void Color32A_D565_SSE4(uint16_t dst[], SkPMColor src, int count, int x, int y) {
75 SkASSERT(count > 0);
76
77 uint32_t src_expand = (SkGetPackedG32(src) << 24) |
78 (SkGetPackedR32(src) << 13) |
79 (SkGetPackedB32(src) << 2);
80 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
81
82 // Check if we have enough pixels to run SIMD
83 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
84 __m128i* dst_wide;
85 const __m128i src_expand_wide = _mm_set1_epi32(src_expand);
86 const __m128i scale_wide = _mm_set1_epi32(scale);
87 const __m128i mask_green = _mm_set1_epi32(SK_R16_MASK_IN_PLACE |
88 SK_B16_MASK_IN_PLACE |
89 (SK_G16_MASK_IN_PLACE << 16));
90
91 // Align dst to an even 16 byte address (0-7 pixels)
92 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
93 *dst = Color32A_D565_1x(*dst, scale, src_expand);
94 dst += 1;
95 count--;
96 }
97
98 dst_wide = reinterpret_cast<__m128i*>(dst);
99 do {
100 // Load 8 RGB565 pixels
101 __m128i pixels = _mm_load_si128(dst_wide);
102
103 // Duplicate and mask
104 __m128i pixels_high = _mm_unpackhi_epi16(pixels, pixels);
105 pixels_high = _mm_and_si128(mask_green, pixels_high);
106 pixels = _mm_unpacklo_epi16(pixels, pixels);
107 pixels = _mm_and_si128(mask_green, pixels);
108
109 // Scale with alpha
110 pixels_high = _mm_mullo_epi32(pixels_high, scale_wide);
111 pixels = _mm_mullo_epi32(pixels, scale_wide);
112
113 // Add src_expand_wide and shift down again
114 pixels_high = _mm_add_epi32(pixels_high, src_expand_wide);
115 pixels_high = _mm_srli_epi32(pixels_high, 5);
116 pixels = _mm_add_epi32(pixels, src_expand_wide);
117 pixels = _mm_srli_epi32(pixels, 5);
118
119 // Mask
120 pixels_high = _mm_and_si128(mask_green, pixels_high);
121 pixels = _mm_and_si128(mask_green, pixels);
122
123 // Combine into RGB565 and store
124 pixels = _mm_hadd_epi16(pixels, pixels_high);
125 _mm_store_si128(dst_wide, pixels);
126 count -= 8;
127 dst_wide++;
128 } while (count >= 8);
129
130 dst = reinterpret_cast<uint16_t*>(dst_wide);
131 }
132
133 // Small loop to handle remaining pixels.
134 while (count > 0) {
135 *dst = Color32A_D565_1x(*dst, scale, src_expand);
136 dst += 1;
137 count--;
138 }
139 }
140
141 #endif 64 #endif
OLDNEW
« no previous file with comments | « src/opts/SkBlitRow_opts_SSE4.h ('k') | src/opts/opts_check_x86.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698