Chromium Code Reviews

Index: src/compiler/arm/code-generator-arm.cc
diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
index 331a866662e3210086caa934389785823ab9b186..82e04987e656cf44a74afadd4afa630d4839c71d 100644
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -347,6 +347,14 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
   return kNoCondition;
 }
 
+int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
+  // If unary shuffle, table is src0 (2 d-registers).
+  if (src0.is(src1)) return 2;
+  // Binary shuffle, table is src0, src1. They must be consecutive.
+  DCHECK_EQ(src0.code() + 1, src1.code());
+  return 4;  // 4 d-registers.
+}
+
 }  // namespace
 
 #define ASSEMBLE_CHECKED_LOAD_FP(Type) \
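The helper above encodes the register constraint that makes the vtbl-based shuffles below work: a unary shuffle's lookup table is a single Q register (two D registers), while a binary shuffle needs src0 and src1 to be adjacent Q registers so that their four D registers form one contiguous NEON register list. A minimal standalone sketch of that rule, using plain register codes rather than V8's register types (names here are illustrative only, not part of the patch):

#include <cassert>

// Q register n overlaps D registers 2n and 2n+1, so two adjacent Q registers
// supply four consecutive D registers for a NEON vtbl register list.
int VtblTableSizeSketch(int src0_q_code, int src1_q_code) {
  if (src0_q_code == src1_q_code) return 2;  // unary shuffle: one Q register
  assert(src0_q_code + 1 == src1_q_code);    // binary: Q registers must be adjacent
  return 4;                                  // two Q registers = four D registers
}
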
@@ -2186,6 +2194,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vtrn(Neon32, dst, kScratchQuadReg);  // dst = [0, 4, 2, 6]
       break;
     }
+    case kArmS32x4Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      // Check for in-place shuffles.
+      // If dst == src0 == src1, then the shuffle is unary and we only use src0.
+      if (dst.is(src0)) {
+        __ vmov(kScratchQuadReg, src0);
+        src0 = kScratchQuadReg;
+      } else if (dst.is(src1)) {
+        __ vmov(kScratchQuadReg, src1);
+        src1 = kScratchQuadReg;
+      }
+      // Perform shuffle as a vmov per lane.
+      int dst_code = dst.code() * 4;
+      int src0_code = src0.code() * 4;
+      int src1_code = src1.code() * 4;
+      int32_t shuffle = i.InputInt32(2);
+      for (int i = 0; i < 4; i++) {
+        int lane = shuffle & 0x7;
+        int src_code = src0_code;
+        if (lane >= 4) {
+          src_code = src1_code;
+          lane &= 0x3;
+        }
+        __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
martyn.capewell  2017/05/05 14:28:49
This will become expensive when each s-register mo…

bbudge  2017/05/05 20:36:28
Yes, I think I have a TODO to improve VMovExtended…
+        shuffle >>= 8;
+      }
+      break;
+    }
     case kArmS32x4TransposeRight: {
       Simd128Register dst = i.OutputSimd128Register(),
                       src1 = i.InputSimd128Register(1);
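For reference, the lane encoding that the kArmS32x4Shuffle case above consumes can be modeled in scalar code: each byte of the 32-bit immediate (low byte first) names one output lane, with selector values 0-3 reading from src0 and 4-7 reading from src1. A small sketch under those assumptions; the function and test values are illustrative, not part of the patch:

#include <cstdint>
#include <cstdio>

// Scalar model of the byte-per-lane encoding used by kArmS32x4Shuffle.
void S32x4ShuffleModel(const uint32_t src0[4], const uint32_t src1[4],
                       uint32_t shuffle, uint32_t dst[4]) {
  for (int i = 0; i < 4; i++) {
    int lane = shuffle & 0x7;                         // selector for lane i
    dst[i] = lane < 4 ? src0[lane] : src1[lane - 4];  // 0-3: src0, 4-7: src1
    shuffle >>= 8;                                    // next selector byte
  }
}

int main() {
  uint32_t a[4] = {10, 11, 12, 13}, b[4] = {20, 21, 22, 23}, d[4];
  S32x4ShuffleModel(a, b, 0x07050300, d);            // lanes [0, 3, 5, 7]
  printf("%u %u %u %u\n", d[0], d[1], d[2], d[3]);   // prints 10 13 21 23
  return 0;
}
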
@@ -2249,6 +2287,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vtrn(Neon16, kScratchQuadReg, dst);  // dst = [1, 9, 3, 11, ... 15]
       break;
     }
+    case kArmS16x8Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      DwVfpRegister table_base = src0.low();
+      int table_size = GetVtblTableSize(src0, src1);
+      // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
+      int scratch_s_base = kScratchQuadReg.code() * 4;
+      for (int j = 0; j < 2; j++) {
+        int32_t four_lanes = i.InputInt32(2 + j);
+        for (int k = 0; k < 2; k++) {
+          uint8_t w0 = (four_lanes & 0xFF) * kShortSize;
+          four_lanes >>= 8;
+          uint8_t w1 = (four_lanes & 0xFF) * kShortSize;
+          four_lanes >>= 8;
+          int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
+          __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
+                  bit_cast<float>(mask));
martyn.capewell  2017/05/05 14:28:49
This may cause a problem - if your mask looks like…

bbudge  2017/05/05 20:36:28
I could also finesse the code so NaNs can't be gen…

martyn.capewell  2017/05/08 13:33:53
That will fix the NaN problem. However, the assemb…
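To make the mask construction under discussion concrete: each 16-bit lane selector L expands to the byte-index pair (2L, 2L+1) that vtbl will consume, and two selectors' worth of byte indices are packed into one 32-bit word, which is then moved into an s-register as a bit_cast float (the thread above concerns what happens when that bit pattern resembles a NaN). A scalar sketch of the packing step, with illustrative names; kShortSize is written out as the literal 2:

#include <cstdint>

// Expands two 16-bit lane selectors (0-7 pick src0 lanes, 8-15 pick src1
// lanes in a binary shuffle) into the four vtbl byte indices they stand for,
// packed low byte first.
uint32_t PackLanePairToByteMask(uint8_t lane0, uint8_t lane1) {
  uint8_t w0 = lane0 * 2;  // first byte of lane0; its bytes are w0, w0 + 1
  uint8_t w1 = lane1 * 2;  // first byte of lane1; its bytes are w1, w1 + 1
  return static_cast<uint32_t>(w0) | ((w0 + 1u) << 8) | (w1 << 16) |
         ((w1 + 1u) << 24);
}

// Example: selectors {3, 5} expand to bytes {6, 7, 10, 11} = 0x0B0A0706.
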
+        }
+      }
+      NeonListOperand table(table_base, table_size);
+      __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+      __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+      __ vmov(dst, kScratchQuadReg);
martyn.capewell  2017/05/05 14:28:49
If you know dst doesn't alias src0 or src1, vtbl c…

bbudge  2017/05/05 20:36:28
Nice, done.
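The lookup itself has straightforward semantics that this code leans on: each byte of the index register selects one byte from the table register list, and an index past the end of the table yields zero. That is also why the reviewer's suggestion above (as far as the truncated comment reads, letting vtbl write straight to dst when dst does not alias src0 or src1) is safe to apply. A scalar model of a single vtbl, with illustrative names:

#include <cstdint>

// Scalar model of one NEON vtbl: 'indices' selects bytes from 'table';
// an index >= table_len produces 0 (vtbx would keep the old byte instead).
void VtblModel(uint8_t dst[8], const uint8_t* table, int table_len,
               const uint8_t indices[8]) {
  for (int i = 0; i < 8; i++) {
    dst[i] = indices[i] < table_len ? table[indices[i]] : 0;
  }
}

The kArmS16x8Shuffle case issues two such lookups, one for the low and one for the high D register of the 128-bit result.
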
+      break;
+    }
     case kArmS8x16ZipLeft: {
       Simd128Register dst = i.OutputSimd128Register(),
                       src1 = i.InputSimd128Register(1);
@@ -2308,6 +2372,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                i.InputSimd128Register(1), i.InputInt4(2));
       break;
     }
+    case kArmS8x16Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      DwVfpRegister table_base = src0.low();
+      int table_size = GetVtblTableSize(src0, src1);
+      // The shuffle lane mask is a byte mask, materialize in kScratchQuadReg.
+      int scratch_s_base = kScratchQuadReg.code() * 4;
+      for (int j = 0; j < 4; j++) {
+        int32_t four_lanes = i.InputInt32(2 + j);
+        __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
+                bit_cast<float>(four_lanes));
+      }
+      NeonListOperand table(table_base, table_size);
+      __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+      __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+      __ vmov(dst, kScratchQuadReg);
+      break;
+    }
     case kArmS32x2Reverse: {
       __ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
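Taken as a whole, the kArmS8x16Shuffle case is a 16-byte gather: the four 32-bit immediates carry 16 byte selectors, and each selected byte comes from a 16-byte table (unary shuffle, src0 only) or a 32-byte table (binary shuffle, src0 followed by src1). A compact end-to-end sketch of that behavior in scalar terms, with plain arrays standing in for the NEON registers and illustrative names throughout:

#include <cstdint>

// src0/src1 are the two 16-byte inputs; 'indices' holds the 16 byte
// selectors (0-15 pick from src0; 16-31 pick from src1 in a binary shuffle).
void S8x16ShuffleModel(const uint8_t src0[16], const uint8_t src1[16],
                       const uint8_t indices[16], bool binary,
                       uint8_t dst[16]) {
  for (int i = 0; i < 16; i++) {
    uint8_t idx = indices[i];
    if (idx < 16) {
      dst[i] = src0[idx];
    } else if (binary && idx < 32) {
      dst[i] = src1[idx - 16];
    } else {
      dst[i] = 0;  // matches vtbl's out-of-range behavior
    }
  }
}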