Index: src/codec/SkSwizzler.cpp |
diff --git a/src/codec/SkSwizzler.cpp b/src/codec/SkSwizzler.cpp |
index 7865184cced94e91a0d22caf9291093ed6211c07..508cc8d71eaf2f9ca02ea8e2265b884a72407626 100644 |
--- a/src/codec/SkSwizzler.cpp |
+++ b/src/codec/SkSwizzler.cpp |
@@ -220,7 +220,75 @@ static void swizzle_small_index_to_n32( |
static void swizzle_index_to_n32( |
void* SK_RESTRICT dstRow, const uint8_t* SK_RESTRICT src, int dstWidth, |
int bpp, int deltaSrc, int offset, const SkPMColor ctable[]) { |
+#if defined(SK_ARM_HAS_NEON) |
+ uint32_t* dst = (uint32_t*) dstRow; |
+ src += offset; |
+ // The vector loop reads 16 adjacent index bytes, so it may only run when deltaSrc == 1. |
+ while (deltaSrc == 1 && dstWidth >= 16) { |
+ // Table registers: t0/t1 receive the de-interleaved loads, tr/tg/tb/ta are the VTBL tables |
+ uint8x16x4_t t0, t1; |
+ uint8x8x4_t tr, tg, tb, ta; |
+ |
+ // Indices into table: one byte per output pixel, 16 pixels per iteration |
+ uint8x16_t indices = vld1q_u8(src); |
+ |
+ // Pixel output registers, zeroed so that each partial lookup can be ORed in |
+ uint8x16x4_t rgba; |
+ rgba.val[0] = vdupq_n_u8(0); |
+ rgba.val[1] = vdupq_n_u8(0); |
+ rgba.val[2] = vdupq_n_u8(0); |
+ rgba.val[3] = vdupq_n_u8(0); |
+ |
+ const uint32_t* table = ctable; |
+ const int numColors = 256; // NOTE(review): assumes ctable holds 256 entries - confirm |
+ const int numColorsPerLoop = 32; |
+ for (int j = 0; j < numColors / numColorsPerLoop; j++) { |
+ // Load 32 entries, split by vld4q into bytes 0..3 of each entry (labelled r, g, b, a) |
+ t0 = vld4q_u8((const uint8_t*) (table + 0)); // rgba |
+ t1 = vld4q_u8((const uint8_t*) (table + 16)); // RGBA |
+ SkTSwap(t0.val[1], t1.val[0]); // rRba, gGBA |
+ SkTSwap(t0.val[3], t1.val[2]); // rRbB, gGaA |
+ tr = *(((uint8x8x4_t*) &t0) + 0); // rR |
+ tb = *(((uint8x8x4_t*) &t0) + 1); // bB |
+ tg = *(((uint8x8x4_t*) &t1) + 0); // gG |
+ ta = *(((uint8x8x4_t*) &t1) + 1); // aA |
+ |
+ // Use VTBL (out-of-range indices yield 0), then OR the results together. |
+ rgba.val[0] = vorrq_u8(rgba.val[0], |
+ vcombine_u8(vtbl4_u8(tr, vget_low_u8(indices)), |
+ vtbl4_u8(tr, vget_high_u8(indices)))); |
+ rgba.val[1] = vorrq_u8(rgba.val[1], |
+ vcombine_u8(vtbl4_u8(tg, vget_low_u8(indices)), |
+ vtbl4_u8(tg, vget_high_u8(indices)))); |
+ rgba.val[2] = vorrq_u8(rgba.val[2], |
+ vcombine_u8(vtbl4_u8(tb, vget_low_u8(indices)), |
+ vtbl4_u8(tb, vget_high_u8(indices)))); |
+ rgba.val[3] = vorrq_u8(rgba.val[3], |
+ vcombine_u8(vtbl4_u8(ta, vget_low_u8(indices)), |
+ vtbl4_u8(ta, vget_high_u8(indices)))); |
+ |
+ // Rebase the indices by 32 so they address the next 32 table entries. Indices |
+ // already looked up wrap around (uint8 underflow) and so fall out of range. |
+ indices = vsubq_u8(indices, vdupq_n_u8(32)); |
+ table += numColorsPerLoop; |
+ } |
+ |
+ // Store output pixel values (vst4q re-interleaves the four channel registers). |
+ vst4q_u8((uint8_t*) dst, rgba); |
+ src += 16; |
+ dst += 16; |
+ dstWidth -= 16; |
+ } |
+ // Scalar path: the leftover pixels, or the whole row when deltaSrc != 1. |
+ for (int x = 0; x < dstWidth; x++) { |
+ SkPMColor c = ctable[*src]; |
+ dst[x] = c; |
+ src += deltaSrc; |
+ } |
+ |
+ |
+#else |
src += offset; |
SkPMColor* SK_RESTRICT dst = (SkPMColor*)dstRow; |
for (int x = 0; x < dstWidth; x++) { |
@@ -228,6 +296,7 @@ static void swizzle_index_to_n32( |
dst[x] = c; |
src += deltaSrc; |
} |
+#endif |
} |
static void swizzle_index_to_n32_skipZ( |