Index: src/opts/SkSwizzler_opts.h
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 8d1be84df27490cf19d97f611d3550c2e900781f..ad121cfafecc42d33fddaa2227ed9fd0693e659b 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -60,6 +60,34 @@ static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
     }
 }
 
+static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*)vsrc;
+    for (int i = 0; i < count; i++) {
+        uint8_t r = src[0],
+                g = src[1],
+                b = src[2];
+        src += 3;
+        dst[i] = (uint32_t)0xFF << 24
+               | (uint32_t)b << 16
+               | (uint32_t)g << 8
+               | (uint32_t)r << 0;
+    }
+}
+
+static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*)vsrc;
+    for (int i = 0; i < count; i++) {
+        uint8_t r = src[0],
+                g = src[1],
+                b = src[2];
+        src += 3;
+        dst[i] = (uint32_t)0xFF << 24
+               | (uint32_t)r << 16
+               | (uint32_t)g << 8
+               | (uint32_t)b << 0;
+    }
+}
+
 #if defined(SK_ARM_HAS_NEON)
 
 // Rounded divide by 255, (x + 127) / 255
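(Aside, not part of the patch: a minimal standalone check of the packing in the two portable routines above. It assumes this header is included so RGB_to_RGB1_portable / RGB_to_BGR1_portable are visible; the final byte-order assert additionally assumes a little-endian target, where the packed word lands in memory as R,G,B,0xFF, i.e. RGBA_8888 with opaque alpha.)

    // Illustrative sketch only. The packed value 0xFF<<24 | B<<16 | G<<8 | R is
    // endian-independent; only the in-memory byte-order check needs little-endian.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
        const uint8_t rgb[3] = { 0x11, 0x22, 0x33 };   // R, G, B
        uint32_t rgb1 = 0, bgr1 = 0;
        RGB_to_RGB1_portable(&rgb1, rgb, 1);
        RGB_to_BGR1_portable(&bgr1, rgb, 1);
        assert(rgb1 == 0xFF332211u);                   // 0xFF<<24 | B<<16 | G<<8 | R
        assert(bgr1 == 0xFF112233u);                   // 0xFF<<24 | R<<16 | G<<8 | B
        uint8_t bytes[4];
        std::memcpy(bytes, &rgb1, sizeof(rgb1));
        assert(bytes[0] == 0x11 && bytes[3] == 0xFF);  // little-endian: R first, alpha last
        return 0;
    }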
@@ -96,12 +124,12 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
     auto src = (const uint32_t*)vsrc;
     while (count >= 8) {
         // Load 8 pixels.
-        uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
+        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
 
-        uint8x8_t a = bgra.val[3],
-                  b = bgra.val[2],
-                  g = bgra.val[1],
-                  r = bgra.val[0];
+        uint8x8_t a = rgba.val[3],
+                  b = rgba.val[2],
+                  g = rgba.val[1],
+                  r = rgba.val[0];
 
         // Premultiply.
         b = scale(b, a);
@@ -110,15 +138,15 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
 
         // Store 8 premultiplied pixels.
         if (kSwapRB) {
-            bgra.val[2] = r;
-            bgra.val[1] = g;
-            bgra.val[0] = b;
+            rgba.val[2] = r;
+            rgba.val[1] = g;
+            rgba.val[0] = b;
         } else {
-            bgra.val[2] = b;
-            bgra.val[1] = g;
-            bgra.val[0] = r;
+            rgba.val[2] = b;
+            rgba.val[1] = g;
+            rgba.val[0] = r;
         }
-        vst4_u8((uint8_t*) dst, bgra);
+        vst4_u8((uint8_t*) dst, rgba);
         src += 8;
         dst += 8;
         count -= 8;
@@ -141,13 +169,13 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     auto src = (const uint32_t*)vsrc;
     while (count >= 16) {
         // Load 16 pixels.
-        uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);
+        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
 
         // Swap r and b.
-        SkTSwap(bgra.val[0], bgra.val[2]);
+        SkTSwap(rgba.val[0], rgba.val[2]);
 
         // Store 16 pixels.
-        vst4q_u8((uint8_t*) dst, bgra);
+        vst4q_u8((uint8_t*) dst, rgba);
         src += 16;
         dst += 16;
         count -= 16;
@@ -155,13 +183,13 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
 
     if (count >= 8) {
         // Load 8 pixels.
-        uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
+        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
 
         // Swap r and b.
-        SkTSwap(bgra.val[0], bgra.val[2]);
+        SkTSwap(rgba.val[0], rgba.val[2]);
 
         // Store 8 pixels.
-        vst4_u8((uint8_t*) dst, bgra);
+        vst4_u8((uint8_t*) dst, rgba);
         src += 8;
         dst += 8;
         count -= 8;
@@ -170,6 +198,68 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
+template <bool kSwapRB>
+static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
+    const uint8_t* src = (const uint8_t*) vsrc;
+    while (count >= 16) {
+        // Load 16 pixels.
+        uint8x16x3_t rgb = vld3q_u8(src);
+
+        // Insert an opaque alpha channel and swap if needed.
+        uint8x16x4_t rgba;
+        if (kSwapRB) {
+            rgba.val[0] = rgb.val[2];
+            rgba.val[2] = rgb.val[0];
+        } else {
+            rgba.val[0] = rgb.val[0];
+            rgba.val[2] = rgb.val[2];
+        }
+        rgba.val[1] = rgb.val[1];
+        rgba.val[3] = vdupq_n_u8(0xFF);
+
+        // Store 16 pixels.
+        vst4q_u8((uint8_t*) dst, rgba);
+        src += 16*3;
+        dst += 16;
+        count -= 16;
+    }
+
+    if (count >= 8) {
+        // Load 8 pixels.
+        uint8x8x3_t rgb = vld3_u8(src);
+
+        // Insert an opaque alpha channel and swap if needed.
+        uint8x8x4_t rgba;
+        if (kSwapRB) {
+            rgba.val[0] = rgb.val[2];
+            rgba.val[2] = rgb.val[0];
+        } else {
+            rgba.val[0] = rgb.val[0];
+            rgba.val[2] = rgb.val[2];
+        }
+        rgba.val[1] = rgb.val[1];
+        rgba.val[3] = vdup_n_u8(0xFF);
+
+        // Store 8 pixels.
+        vst4_u8((uint8_t*) dst, rgba);
+        src += 8*3;
+        dst += 8;
+        count -= 8;
+    }
+
+    // Call portable code to finish up the tail of [0,8) pixels.
+    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
+    proc(dst, src, count);
+}
+
+static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+    insert_alpha_should_swaprb<false>(dst, src, count);
+}
+
+static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+    insert_alpha_should_swaprb<true>(dst, src, count);
+}
+
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 
 template <bool kSwapRB>
@@ -268,6 +358,14 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
+static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+    RGB_to_RGB1_portable(dst, src, count);
+}
+
+static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+    RGB_to_BGR1_portable(dst, src, count);
+}
+
 #else
 
 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
@@ -282,6 +380,14 @@ static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
+static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+    RGB_to_RGB1_portable(dst, src, count);
+}
+
+static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+    RGB_to_BGR1_portable(dst, src, count);
+}
+
 #endif
 
 }
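(Aside, not part of the patch: a sketch of how a caller might dispatch between the two new entry points. Only RGB_to_RGB1 and RGB_to_BGR1 come from the patch; SwizzleProc, DstOrder, choose_rgb_xform, and expand_row are hypothetical names used for illustration, and this header is assumed to be included.)

    #include <cstdint>

    // Hypothetical glue, not Skia API: expand a packed 24-bit RGB row into
    // 32-bit opaque pixels, swapping R and B only when the destination is BGRA.
    using SwizzleProc = void (*)(uint32_t dst[], const void* src, int count);

    enum class DstOrder { kRGBA, kBGRA };

    static SwizzleProc choose_rgb_xform(DstOrder order) {
        return order == DstOrder::kRGBA ? RGB_to_RGB1 : RGB_to_BGR1;
    }

    static void expand_row(uint32_t dst[], const uint8_t* srcRGB, int width, DstOrder order) {
        choose_rgb_xform(order)(dst, srcRGB, width);
    }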