third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp - Issue 2576223002: NEON-ize RGBA to RGB code

Unified Diff: third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp

Issue 2576223002: NEON-ize RGBA to RGB code (Closed)

Patch Set: Copyright, fix Windows build. Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « third_party/WebKit/Source/platform/BUILD.gn ('k') | third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoderTest.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp

diff --git a/third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp b/third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp

index 0a70e73db5f7e4733d051ac4b8799add20c74c29..f33de9a045b648ece7f42431214a3799698c79f5 100644

--- a/third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp

+++ b/third_party/WebKit/Source/platform/image-encoders/JPEGImageEncoder.cpp

@@ -33,6 +33,7 @@

#include "SkColorPriv.h"

#include "platform/geometry/IntSize.h"

#include "platform/graphics/ImageBuffer.h"

+#include "platform/image-encoders/RGBAtoRGB.h"

#include "wtf/CurrentTime.h"

#include "wtf/PtrUtil.h"

#include <memory>

@@ -45,6 +46,97 @@ extern "C" {

namespace blink {

+void RGBAtoRGBScalar(const unsigned char* pixels,

msarett1 2016/12/16 13:07:54 libjpeg-turbo actually supports RGBA input (in add

cavalcantii1 2016/12/17 02:35:33 Acknowledged.

+ unsigned pixelCount,

+ unsigned char* output) {

+ // Per <canvas> spec, composite the input image pixels source-over on black.

+ for (; pixelCount-- > 0; pixels += 4) {

+ unsigned char alpha = pixels[3];

+ if (alpha != 255) {

+ *output++ = SkMulDiv255Round(pixels[0], alpha);

+ *output++ = SkMulDiv255Round(pixels[1], alpha);

+ *output++ = SkMulDiv255Round(pixels[2], alpha);

+ } else {

+ *output++ = pixels[0];

+ *output++ = pixels[1];

+ *output++ = pixels[2];

+ }

+// TODO(cavalcantii): use regular macro, see https://crbug.com/673067.

+#ifdef __ARM_NEON__

+void RGBAtoRGBNeon(const unsigned char* input,

+ const unsigned pixelCount,

+ unsigned char* output) {

+ const unsigned pixelsPerLoad = 16;

+ const unsigned rgbaStep = pixelsPerLoad * 4, rgbStep = pixelsPerLoad * 3;

+ // Input registers.

+ uint8x16x4_t rgba;

+ // Output registers.

+ uint8x16x3_t rgb;

+ // Intermediate registers.

+ uint16x8_t low, high;

+ uint8x16_t result;

+ unsigned counter;

+ for (counter = 0; counter + pixelsPerLoad <= pixelCount;

+ counter += pixelsPerLoad) {

+ // Reads 16 pixels at once, each color channel in a different

+ // 128-bit register.

+ rgba = vld4q_u8(input);

+ // Extracts the low/high part of the 128 bits, multiplying by the

+ // respective alpha channel.

+ low = vmull_u8(vget_low_u8(rgba.val[0]), vget_low_u8(rgba.val[3]));

+ high = vmull_u8(vget_high_u8(rgba.val[0]), vget_high_u8(rgba.val[3]));

+ // Original Skia formula is: (x + (x >> 8)) >> 8, where x = a*b + 128.

+ // This does the rounding shift and the accumulate in a single

+ // instruction.

+ low = vrsraq_n_u16(low, low, 8);

msarett1 2016/12/16 13:07:54 Skia has a NEON implementation of "mul and rounded

cavalcantii1 2016/12/17 02:35:33 I searched for SkSwizzler_opts.h and it seems it i

+ high = vrsraq_n_u16(high, high, 8);

+ // Now do the final shift and combine both halves into one vector.

+ result = vcombine_u8(vqrshrn_n_u16(low, 8), vqrshrn_n_u16(high, 8));

+ // Write back the Red channel to the first 128-bit register.

+ rgb.val[0] = result;

+ // Now the Green channel (don't trust the compiler to unroll the loop).

msarett1 2016/12/16 13:07:54 What about using an inline helper function? Ex: m

cavalcantii1 2016/12/17 02:35:33 I tested moving the pixel manipulation code to a l

+ low = vmull_u8(vget_low_u8(rgba.val[1]), vget_low_u8(rgba.val[3]));

+ high = vmull_u8(vget_high_u8(rgba.val[1]), vget_high_u8(rgba.val[3]));

+ low = vrsraq_n_u16(low, low, 8);

+ high = vrsraq_n_u16(high, high, 8);

+ result = vcombine_u8(vqrshrn_n_u16(low, 8), vqrshrn_n_u16(high, 8));

+ rgb.val[1] = result;

+ // Finally the Blue channel.

+ low = vmull_u8(vget_low_u8(rgba.val[2]), vget_low_u8(rgba.val[3]));

+ high = vmull_u8(vget_high_u8(rgba.val[2]), vget_high_u8(rgba.val[3]));

+ low = vrsraq_n_u16(low, low, 8);

+ high = vrsraq_n_u16(high, high, 8);

+ result = vcombine_u8(vqrshrn_n_u16(low, 8), vqrshrn_n_u16(high, 8));

+ rgb.val[2] = result;

+ // Write back (interleaved) results to output.

+ vst3q_u8(output, rgb);

+ // Advance to the next elements (could be avoided by loading the register

+ // with post-increment, i.e. "vld4 {vector}, [r1]!").

+ input += rgbaStep;

+ output += rgbStep;

+ }

+ // Handle the tail elements.

+ unsigned remaining = pixelCount;

+ remaining -= counter;

+ if (remaining != 0) {

+ RGBAtoRGBScalar(input, remaining, output);

+ }

+#endif

struct JPEGOutputBuffer : public jpeg_destination_mgr {

DISALLOW_NEW();

Vector<unsigned char>* output;

@@ -95,25 +187,6 @@ static void handleError(j_common_ptr common) {

longjmp(*jumpBufferPtr, -1);

}

-static void RGBAtoRGB(const unsigned char* pixels,

- unsigned pixelCount,

- unsigned char* output) {

- // Per <canvas> spec, composite the input image pixels source-over on black.

- for (; pixelCount-- > 0; pixels += 4) {

- unsigned char alpha = pixels[3];

- if (alpha != 255) {

- *output++ = SkMulDiv255Round(pixels[0], alpha);

- *output++ = SkMulDiv255Round(pixels[1], alpha);

- *output++ = SkMulDiv255Round(pixels[2], alpha);

- } else {

- *output++ = pixels[0];

- *output++ = pixels[1];

- *output++ = pixels[2];

- }

static void disableSubsamplingForHighQuality(jpeg_compress_struct* cinfo,

int quality) {

if (quality < 100)