Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(960)

Unified Diff: src/opts/SkSwizzler_opts.h

Issue 1680743005: NEON Optimized RGBA->PMColor sampling in SkSwizzler (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Share implementations for sampleSize 4 and 8 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/opts/SkSwizzler_opts.h
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 15eec3a355a1bee38f28008c7ef0a1fc36ccafb5..345e6974d020ed1dcaeffa97a7951400c2867c2e 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -12,13 +12,16 @@
namespace SK_OPTS_NS {
+template <int kSampleSize>
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
+ int j = 0;
for (int i = 0; i < count; i++) {
- uint8_t a = src[i] >> 24,
- b = src[i] >> 16,
- g = src[i] >> 8,
- r = src[i] >> 0;
+ uint8_t a = src[j] >> 24,
+ b = src[j] >> 16,
+ g = src[j] >> 8,
+ r = src[j] >> 0;
+ j += kSampleSize;
b = (b*a+127)/255;
g = (g*a+127)/255;
r = (r*a+127)/255;
@@ -29,13 +32,16 @@ static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
}
}
+template <int kSampleSize>
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
auto src = (const uint32_t*)vsrc;
+ int j = 0;
for (int i = 0; i < count; i++) {
- uint8_t a = src[i] >> 24,
- b = src[i] >> 16,
- g = src[i] >> 8,
- r = src[i] >> 0;
+ uint8_t a = src[j] >> 24,
+ b = src[j] >> 16,
+ g = src[j] >> 8,
+ r = src[j] >> 0;
+ j += kSampleSize;
b = (b*a+127)/255;
g = (g*a+127)/255;
r = (r*a+127)/255;
@@ -225,7 +231,7 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
}
// Call portable code to finish up the tail of [0,8) pixels.
- auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
+ auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
proc(dst, src, count);
}
@@ -484,6 +490,121 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
inverted_cmyk_to<kBGR1>(dst, src, count);
}
+// Loader for sample size 2: reads 16 consecutive RGBA pixels starting at src and returns 8 of
+// them, split by channel into *r, *g, *b and *a (8 bytes each).
+static void load_rgba_sample2(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
+ uint8x8_t* a) {
+ // Load 16 pixels.
+ uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
+
+ // Choose 8 pixels.
+ // pxpxpxpxpxpxpxpx -> pppppppp
+ // Each 16-byte channel vector is viewed as eight 16-bit lanes; narrowing keeps one byte per
+ // lane, i.e. every other pixel. NOTE(review): that is the even-indexed pixels only if the low
+ // byte of each lane is the first byte in memory (little-endian) - confirm for all targets.
+ *r = vmovn_u16(vreinterpretq_u16_u8(rgba.val[0]));
+ *g = vmovn_u16(vreinterpretq_u16_u8(rgba.val[1]));
+ *b = vmovn_u16(vreinterpretq_u16_u8(rgba.val[2]));
+ *a = vmovn_u16(vreinterpretq_u16_u8(rgba.val[3]));
+}
+
+// Gathers 8 RGBA pixels spaced kSampleSize pixels apart (src[0], src[kSampleSize], ...,
+// src[7*kSampleSize]) and returns them split by channel: *r, *g, *b and *a each receive the
+// 8 bytes of one channel, in pixel order.
+//
+// Every vld1_u8 reads 8 bytes (two pixels) and only the first pixel is kept, so the last load
+// touches src[7*kSampleSize + 1]. The caller must guarantee that pixel is readable.
+template <int kSampleSize>
+static void load_rgba_sample(const uint32_t* src, uint8x8_t* r, uint8x8_t* g, uint8x8_t* b,
+ uint8x8_t* a) {
+ // The stride is measured in pixels, so it is added to the uint32_t pointer BEFORE the cast to
+ // bytes. Writing `(const uint8_t*) src + n` would advance only n bytes (n/4 of a pixel).
+ uint8x8_t rgba0 = vld1_u8((const uint8_t*) (src + 0*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba1 = vld1_u8((const uint8_t*) (src + 1*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba01 = vzip_u8(rgba0, rgba1).val[0]; // rrgg bbaa
+
+ uint8x8_t rgba2 = vld1_u8((const uint8_t*) (src + 2*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba3 = vld1_u8((const uint8_t*) (src + 3*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba23 = vzip_u8(rgba2, rgba3).val[0]; // rrgg bbaa
+
+ // Interleave the byte pairs of pixels 0-1 with those of pixels 2-3 as 16-bit lanes.
+ uint16x4x2_t rgba03 = vzip_u16(vreinterpret_u16_u8(rgba01), // rrrr gggg
+ vreinterpret_u16_u8(rgba23)); // bbbb aaaa
+
+ uint8x8_t rgba4 = vld1_u8((const uint8_t*) (src + 4*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba5 = vld1_u8((const uint8_t*) (src + 5*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba45 = vzip_u8(rgba4, rgba5).val[0]; // rrgg bbaa
+
+ uint8x8_t rgba6 = vld1_u8((const uint8_t*) (src + 6*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba7 = vld1_u8((const uint8_t*) (src + 7*kSampleSize)); // rgba xxxx
+ uint8x8_t rgba67 = vzip_u8(rgba6, rgba7).val[0]; // rrgg bbaa
+
+ // Same interleave for pixels 4-5 and 6-7.
+ uint16x4x2_t rgba47 = vzip_u16(vreinterpret_u16_u8(rgba45), // rrrr gggg
+ vreinterpret_u16_u8(rgba67)); // bbbb aaaa
+
+ // Final 32-bit interleave joins the 4-pixel halves into full 8-pixel channel vectors.
+ uint32x2x2_t rg = vzip_u32(vreinterpret_u32_u16(rgba03.val[0]), // rrrr rrrr
+ vreinterpret_u32_u16(rgba47.val[0])); // gggg gggg
+ uint32x2x2_t ba = vzip_u32(vreinterpret_u32_u16(rgba03.val[1]), // bbbb bbbb
+ vreinterpret_u32_u16(rgba47.val[1])); // aaaa aaaa
+
+ *r = vreinterpret_u8_u32(rg.val[0]);
+ *g = vreinterpret_u8_u32(rg.val[1]);
+ *b = vreinterpret_u8_u32(ba.val[0]);
+ *a = vreinterpret_u8_u32(ba.val[1]);
+}
+
+// Converts `count` sampled pixels from vsrc into dst, taking one source pixel out of every
+// kSampleSize and premultiplying its color channels by alpha. When kSwapRB is true the r and b
+// channels trade places in the output.
+template <bool kSwapRB, int kSampleSize>
+static void premul_should_swapRB_sample(uint32_t* dst, const void* vsrc, int count) {
+ const uint32_t* src = (const uint32_t*)vsrc;
+
+ // Each pass produces 8 pixels. The bound is 9 rather than 8 so that the vector loads never
+ // read past the end of the source memory.
+ for (; count >= 9; count -= 8, dst += 8, src += 8*kSampleSize) {
+ // Fetch 8 sampled pixels, one vector per channel.
+ uint8x8_t r, g, b, a;
+ if (2 != kSampleSize) {
+ load_rgba_sample<kSampleSize>(src, &r, &g, &b, &a);
+ } else {
+ load_rgba_sample2(src, &r, &g, &b, &a);
+ }
+
+ // Premultiply the color channels by alpha.
+ r = scale(r, a);
+ g = scale(g, a);
+ b = scale(b, a);
+
+ // Interleave the channels back into 8 pixels, in the requested channel order.
+ uint8x8x4_t pixels;
+ pixels.val[0] = kSwapRB ? b : r;
+ pixels.val[1] = g;
+ pixels.val[2] = kSwapRB ? r : b;
+ pixels.val[3] = a;
+ vst4_u8((uint8_t*) dst, pixels);
+ }
+
+ // Fewer than 9 pixels remain; the portable routine finishes them with the same stride.
+ if (kSwapRB) {
+ RGBA_to_bgrA_portable<kSampleSize>(dst, src, count);
+ } else {
+ RGBA_to_rgbA_portable<kSampleSize>(dst, src, count);
+ }
+}
+
+// Non-template entry points sharing the (dst, src, count) signature. Each forwards to
+// premul_should_swapRB_sample with a fixed sample size (2, 4 or 8); the rgbA variants pass
+// kSwapRB = false and the bgrA variants pass kSwapRB = true.
+static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
+ premul_should_swapRB_sample<false, 2>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
+ premul_should_swapRB_sample<true, 2>(dst, src, count);
+}
+
+static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
+ premul_should_swapRB_sample<false, 4>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
+ premul_should_swapRB_sample<true, 4>(dst, src, count);
+}
+
+static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
+ premul_should_swapRB_sample<false, 8>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
+ premul_should_swapRB_sample<true, 8>(dst, src, count);
+}
+
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// Scale a byte by another.
@@ -561,7 +682,7 @@ static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
}
// Call portable code to finish up the tail of [0,4) pixels.
- auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
+ auto proc = kSwapRB ? RGBA_to_bgrA_portable<1> : RGBA_to_rgbA_portable<1>;
proc(dst, src, count);
}
@@ -791,14 +912,38 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
inverted_cmyk_to<kBGR1>(dst, src, count);
}
+// Sampling entry points for this branch: each forwards straight to the portable template
+// instantiated with the matching sample size (2, 4 or 8).
+static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
+ RGBA_to_rgbA_portable<2>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
+ RGBA_to_bgrA_portable<2>(dst, src, count);
+}
+
+static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
+ RGBA_to_rgbA_portable<4>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
+ RGBA_to_bgrA_portable<4>(dst, src, count);
+}
+
+static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
+ RGBA_to_rgbA_portable<8>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
+ RGBA_to_bgrA_portable<8>(dst, src, count);
+}
+
#else
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
- RGBA_to_rgbA_portable(dst, src, count);
+ RGBA_to_rgbA_portable<1>(dst, src, count); // kSampleSize 1: the source index advances one pixel per output
}
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
- RGBA_to_bgrA_portable(dst, src, count);
+ RGBA_to_bgrA_portable<1>(dst, src, count); // kSampleSize 1: the source index advances one pixel per output
}
static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
@@ -833,6 +978,30 @@ static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
inverted_CMYK_to_BGR1_portable(dst, src, count);
}
+// Sampling entry points for the #else branch: each forwards straight to the portable template
+// instantiated with the matching sample size (2, 4 or 8).
+static void RGBA_to_rgbA_sample2(uint32_t dst[], const void* src, int count) {
+ RGBA_to_rgbA_portable<2>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample2(uint32_t dst[], const void* src, int count) {
+ RGBA_to_bgrA_portable<2>(dst, src, count);
+}
+
+static void RGBA_to_rgbA_sample4(uint32_t dst[], const void* src, int count) {
+ RGBA_to_rgbA_portable<4>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample4(uint32_t dst[], const void* src, int count) {
+ RGBA_to_bgrA_portable<4>(dst, src, count);
+}
+
+static void RGBA_to_rgbA_sample8(uint32_t dst[], const void* src, int count) {
+ RGBA_to_rgbA_portable<8>(dst, src, count);
+}
+
+static void RGBA_to_bgrA_sample8(uint32_t dst[], const void* src, int count) {
+ RGBA_to_bgrA_portable<8>(dst, src, count);
+}
+
#endif
}
« no previous file with comments | « src/opts/SkOpts_ssse3.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698