src/opts/SkSwizzler_opts.h - Issue 1626463002: Refactor swizzle names and types.

Unified Diff: src/opts/SkSwizzler_opts.h

Issue 1626463002: Refactor swizzle names and types. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkSwizzler_opts.h

diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h

index b0cf4cad5324eae5eb64fcce75eb755d253a02d5..8d1be84df27490cf19d97f611d3550c2e900781f 100644

--- a/src/opts/SkSwizzler_opts.h

+++ b/src/opts/SkSwizzler_opts.h

@@ -12,51 +12,51 @@

namespace SK_OPTS_NS {

-// These variable names in these functions just pretend the input is BGRA.

msarett 2016/01/22 15:35:08 I find this comment to still be useful? There is… [comment preview truncated]

mtklein 2016/01/22 15:39:52 Sort of... now that the order and the function nam… [comment preview truncated]

msarett 2016/01/22 15:40:51 Agreed that it's obvious. Let's drop it.

-// They work fine with both RGBA and BGRA.

-static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {

+static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

for (int i = 0; i < count; i++) {

uint8_t a = src[i] >> 24,

- r = src[i] >> 16,

+ b = src[i] >> 16,

g = src[i] >> 8,

- b = src[i] >> 0;

- r = (r*a+127)/255;

- g = (g*a+127)/255;

+ r = src[i] >> 0;

b = (b*a+127)/255;

+ g = (g*a+127)/255;

+ r = (r*a+127)/255;

dst[i] = (uint32_t)a << 24

- | (uint32_t)r << 16

+ | (uint32_t)b << 16

| (uint32_t)g << 8

- | (uint32_t)b << 0;

+ | (uint32_t)r << 0;

}

-static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {

+static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

for (int i = 0; i < count; i++) {

uint8_t a = src[i] >> 24,

- r = src[i] >> 16,

+ b = src[i] >> 16,

g = src[i] >> 8,

- b = src[i] >> 0;

- r = (r*a+127)/255;

- g = (g*a+127)/255;

+ r = src[i] >> 0;

b = (b*a+127)/255;

+ g = (g*a+127)/255;

+ r = (r*a+127)/255;

dst[i] = (uint32_t)a << 24

- | (uint32_t)b << 16

+ | (uint32_t)r << 16

| (uint32_t)g << 8

- | (uint32_t)r << 0;

+ | (uint32_t)b << 0;

}

-static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {

+static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

for (int i = 0; i < count; i++) {

uint8_t a = src[i] >> 24,

- r = src[i] >> 16,

+ b = src[i] >> 16,

g = src[i] >> 8,

- b = src[i] >> 0;

+ r = src[i] >> 0;

dst[i] = (uint32_t)a << 24

- | (uint32_t)b << 16

+ | (uint32_t)r << 16

| (uint32_t)g << 8

- | (uint32_t)r << 0;

+ | (uint32_t)b << 0;

}

@@ -92,30 +92,31 @@ static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {

}

template <bool kSwapRB>

-static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

+static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

while (count >= 8) {

// Load 8 pixels.

uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);

uint8x8_t a = bgra.val[3],

- r = bgra.val[2],

+ b = bgra.val[2],

g = bgra.val[1],

- b = bgra.val[0];

+ r = bgra.val[0];

// Premultiply.

- r = scale(r, a);

- g = scale(g, a);

b = scale(b, a);

+ g = scale(g, a);

+ r = scale(r, a);

// Store 8 premultiplied pixels.

if (kSwapRB) {

- bgra.val[2] = b;

- bgra.val[1] = g;

- bgra.val[0] = r;

- } else {

bgra.val[2] = r;

bgra.val[1] = g;

bgra.val[0] = b;

+ } else {

+ bgra.val[2] = b;

+ bgra.val[1] = g;

+ bgra.val[0] = r;

}

vst4_u8((uint8_t*) dst, bgra);

src += 8;

@@ -124,19 +125,20 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int

}

// Call portable code to finish up the tail of [0,8) pixels.

- auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

+ auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;

proc(dst, src, count);

}

-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- premul_xxxa_should_swaprb<false>(dst, src, count);

+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

+ premul_should_swapRB<false>(dst, src, count);

}

-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- premul_xxxa_should_swaprb<true>(dst, src, count);

+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

+ premul_should_swapRB<true>(dst, src, count);

}

-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

+static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

while (count >= 16) {

// Load 16 pixels.

uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);

@@ -165,13 +167,14 @@ static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

count -= 8;

}

- swaprb_xxxa_portable(dst, src, count);

+ RGBA_to_BGRA_portable(dst, src, count);

}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

template <bool kSwapRB>

-static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {

+static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

auto premul8 = [](__m128i* lo, __m128i* hi) {

const __m128i zeros = _mm_setzero_si128();

@@ -185,27 +188,27 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int

}

// Swizzle the pixels to 8-bit planar.

- *lo = _mm_shuffle_epi8(*lo, planar); // bbbbgggg rrrraaaa

- *hi = _mm_shuffle_epi8(*hi, planar); // BBBBGGGG RRRRAAAA

- __m128i bg = _mm_unpacklo_epi32(*lo, *hi), // bbbbBBBB ggggGGGG

- ra = _mm_unpackhi_epi32(*lo, *hi); // rrrrRRRR aaaaAAAA

+ *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa

+ *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA

+ __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG

+ ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA

// Unpack to 16-bit planar.

- __m128i b = _mm_unpacklo_epi8(bg, zeros), // b_b_b_b_ B_B_B_B_

- g = _mm_unpackhi_epi8(bg, zeros), // g_g_g_g_ G_G_G_G_

- r = _mm_unpacklo_epi8(ra, zeros), // r_r_r_r_ R_R_R_R_

- a = _mm_unpackhi_epi8(ra, zeros); // a_a_a_a_ A_A_A_A_

+ __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_

+ g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_

+ b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_

+ a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_

// Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.

- b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

- g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);

+ g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);

+ b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);

// Repack into interlaced pixels.

- bg = _mm_or_si128(b, _mm_slli_epi16(g, 8)); // bgbgbgbg BGBGBGBG

- ra = _mm_or_si128(r, _mm_slli_epi16(a, 8)); // rararara RARARARA

- *lo = _mm_unpacklo_epi16(bg, ra); // bgrabgra bgrabgra

- *hi = _mm_unpackhi_epi16(bg, ra); // BRGABGRA BGRABGRA

+ rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG

+ ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA

+ *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba

+ *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA

};

while (count >= 8) {

@@ -236,46 +239,47 @@ static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int

}

// Call portable code to finish up the tail of [0,4) pixels.

- auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;

+ auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;

proc(dst, src, count);

}

-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- premul_xxxa_should_swaprb<false>(dst, src, count);

+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

+ premul_should_swapRB<false>(dst, src, count);

}

-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- premul_xxxa_should_swaprb<true>(dst, src, count);

+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

+ premul_should_swapRB<true>(dst, src, count);

}

-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

+static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {

+ auto src = (const uint32_t*)vsrc;

const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

while (count >= 4) {

- __m128i bgra = _mm_loadu_si128((const __m128i*) src);

- __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);

- _mm_storeu_si128((__m128i*) dst, rgba);

+ __m128i rgba = _mm_loadu_si128((const __m128i*) src);

+ __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);

+ _mm_storeu_si128((__m128i*) dst, bgra);

src += 4;

dst += 4;

count -= 4;

}

- swaprb_xxxa_portable(dst, src, count);

+ RGBA_to_BGRA_portable(dst, src, count);

}

#else

-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- premul_xxxa_portable(dst, src, count);

+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {

+ RGBA_to_rgbA_portable(dst, src, count);

}

-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- premul_swaprb_xxxa_portable(dst, src, count);

+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {

+ RGBA_to_bgrA_portable(dst, src, count);

}

-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {

- swaprb_xxxa_portable(dst, src, count);

+static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {

+ RGBA_to_BGRA_portable(dst, src, count);

}

#endif

« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | tests/SwizzlerTest.cpp » ('j') | no next file with comments »