Index: src/opts/SkSwizzler_opts.h |
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h |
index b0cf4cad5324eae5eb64fcce75eb755d253a02d5..8d1be84df27490cf19d97f611d3550c2e900781f 100644 |
--- a/src/opts/SkSwizzler_opts.h |
+++ b/src/opts/SkSwizzler_opts.h |
@@ -12,51 +12,51 @@ |
namespace SK_OPTS_NS { |
-// These variable names in these functions just pretend the input is BGRA. |
msarett
2016/01/22 15:35:08
I find this comment to still be useful? There is
mtklein
2016/01/22 15:39:52
Sort of... now that the order and the function nam
msarett
2016/01/22 15:40:51
Agreed that it's obvious. Let's drop it.
|
-// They work fine with both RGBA and BGRA. |
- |
-static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { |
+static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
for (int i = 0; i < count; i++) { |
uint8_t a = src[i] >> 24, |
- r = src[i] >> 16, |
+ b = src[i] >> 16, |
g = src[i] >> 8, |
- b = src[i] >> 0; |
- r = (r*a+127)/255; |
- g = (g*a+127)/255; |
+ r = src[i] >> 0; |
b = (b*a+127)/255; |
+ g = (g*a+127)/255; |
+ r = (r*a+127)/255; |
dst[i] = (uint32_t)a << 24 |
- | (uint32_t)r << 16 |
+ | (uint32_t)b << 16 |
| (uint32_t)g << 8 |
- | (uint32_t)b << 0; |
+ | (uint32_t)r << 0; |
} |
} |
-static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { |
+static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
for (int i = 0; i < count; i++) { |
uint8_t a = src[i] >> 24, |
- r = src[i] >> 16, |
+ b = src[i] >> 16, |
g = src[i] >> 8, |
- b = src[i] >> 0; |
- r = (r*a+127)/255; |
- g = (g*a+127)/255; |
+ r = src[i] >> 0; |
b = (b*a+127)/255; |
+ g = (g*a+127)/255; |
+ r = (r*a+127)/255; |
dst[i] = (uint32_t)a << 24 |
- | (uint32_t)b << 16 |
+ | (uint32_t)r << 16 |
| (uint32_t)g << 8 |
- | (uint32_t)r << 0; |
+ | (uint32_t)b << 0; |
} |
} |
-static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) { |
+static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
for (int i = 0; i < count; i++) { |
uint8_t a = src[i] >> 24, |
- r = src[i] >> 16, |
+ b = src[i] >> 16, |
g = src[i] >> 8, |
- b = src[i] >> 0; |
+ r = src[i] >> 0; |
dst[i] = (uint32_t)a << 24 |
- | (uint32_t)b << 16 |
+ | (uint32_t)r << 16 |
| (uint32_t)g << 8 |
- | (uint32_t)r << 0; |
+ | (uint32_t)b << 0; |
} |
} |
@@ -92,30 +92,31 @@ static uint8x8_t scale(uint8x8_t x, uint8x8_t y) { |
} |
template <bool kSwapRB> |
-static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { |
+static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
while (count >= 8) { |
// Load 8 pixels. |
uint8x8x4_t bgra = vld4_u8((const uint8_t*) src); |
uint8x8_t a = bgra.val[3], |
- r = bgra.val[2], |
+ b = bgra.val[2], |
g = bgra.val[1], |
- b = bgra.val[0]; |
+ r = bgra.val[0]; |
// Premultiply. |
- r = scale(r, a); |
- g = scale(g, a); |
b = scale(b, a); |
+ g = scale(g, a); |
+ r = scale(r, a); |
// Store 8 premultiplied pixels. |
if (kSwapRB) { |
- bgra.val[2] = b; |
- bgra.val[1] = g; |
- bgra.val[0] = r; |
- } else { |
bgra.val[2] = r; |
bgra.val[1] = g; |
bgra.val[0] = b; |
+ } else { |
+ bgra.val[2] = b; |
+ bgra.val[1] = g; |
+ bgra.val[0] = r; |
} |
vst4_u8((uint8_t*) dst, bgra); |
src += 8; |
@@ -124,19 +125,20 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int |
} |
// Call portable code to finish up the tail of [0,8) pixels. |
- auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; |
+ auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
proc(dst, src, count); |
} |
-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- premul_xxxa_should_swaprb<false>(dst, src, count); |
+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
+ premul_should_swapRB<false>(dst, src, count); |
} |
-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- premul_xxxa_should_swaprb<true>(dst, src, count); |
+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
+ premul_should_swapRB<true>(dst, src, count); |
} |
-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
+static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
while (count >= 16) { |
// Load 16 pixels. |
uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src); |
@@ -165,13 +167,14 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
count -= 8; |
} |
- swaprb_xxxa_portable(dst, src, count); |
+ RGBA_to_BGRA_portable(dst, src, count); |
} |
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
template <bool kSwapRB> |
-static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) { |
+static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
auto premul8 = [](__m128i* lo, __m128i* hi) { |
const __m128i zeros = _mm_setzero_si128(); |
@@ -185,27 +188,27 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int |
} |
// Swizzle the pixels to 8-bit planar. |
- *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa |
- *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA |
- __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG |
- ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA |
+ *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa |
+ *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA |
+ __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG |
+ ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA |
// Unpack to 16-bit planar. |
- __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_ |
- g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_ |
- r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_ |
- a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_ |
+ __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ |
+ g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ |
+ b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ |
+ a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ |
// Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. |
- b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); |
- g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); |
r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); |
+ g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); |
+ b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); |
// Repack into interlaced pixels. |
- bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG |
- ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA |
- *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra |
- *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA |
+ rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG |
+ ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA |
+ *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba |
+ *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA |
}; |
while (count >= 8) { |
@@ -236,46 +239,47 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int |
} |
// Call portable code to finish up the tail of [0,4) pixels. |
- auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable; |
+ auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable; |
proc(dst, src, count); |
} |
-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- premul_xxxa_should_swaprb<false>(dst, src, count); |
+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
+ premul_should_swapRB<false>(dst, src, count); |
} |
-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- premul_xxxa_should_swaprb<true>(dst, src, count); |
+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
+ premul_should_swapRB<true>(dst, src, count); |
} |
-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
+static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) { |
+ auto src = (const uint32_t*)vsrc; |
const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15); |
while (count >= 4) { |
- __m128i bgra = _mm_loadu_si128((const __m128i*) src); |
- __m128i rgba = _mm_shuffle_epi8(bgra, swapRB); |
- _mm_storeu_si128((__m128i*) dst, rgba); |
+ __m128i rgba = _mm_loadu_si128((const __m128i*) src); |
+ __m128i bgra = _mm_shuffle_epi8(rgba, swapRB); |
+ _mm_storeu_si128((__m128i*) dst, bgra); |
src += 4; |
dst += 4; |
count -= 4; |
} |
- swaprb_xxxa_portable(dst, src, count); |
+ RGBA_to_BGRA_portable(dst, src, count); |
} |
#else |
-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- premul_xxxa_portable(dst, src, count); |
+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
+ RGBA_to_rgbA_portable(dst, src, count); |
} |
-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- premul_swaprb_xxxa_portable(dst, src, count); |
+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
+ RGBA_to_bgrA_portable(dst, src, count); |
} |
-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) { |
- swaprb_xxxa_portable(dst, src, count); |
+static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) { |
+ RGBA_to_BGRA_portable(dst, src, count); |
} |
#endif |