Index: src/compiler/arm/code-generator-arm.cc
diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
index 331a866662e3210086caa934389785823ab9b186..b69a70cd3360ce2ba7d2a889c2fede3a0ed30870 100644
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -347,6 +347,14 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
   return kNoCondition;
 }
 
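+// Determines the vtbl table size: NEON vtbl/vtbx take a list of 1 to 4
+// consecutive d-registers, and each Q-sized shuffle source contributes 2.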
+int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
+  // If unary shuffle, table is src0 (2 d-registers).
+  if (src0.is(src1)) return 2;
+  // Binary shuffle, table is src0, src1. They must be consecutive.
+  DCHECK_EQ(src0.code() + 1, src1.code());
+  return 4;  // 4 d-registers.
+}
+
 }  // namespace
 
 #define ASSEMBLE_CHECKED_LOAD_FP(Type) \
@@ -2186,6 +2194,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vtrn(Neon32, dst, kScratchQuadReg);  // dst = [0, 4, 2, 6]
       break;
     }
+    case kArmS32x4Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      // Check for in-place shuffles.
+      // If dst == src0 == src1, then the shuffle is unary and we only use src0.
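+      // When dst aliases a source, copy that source to kScratchQuadReg first;
+      // the per-lane moves below would otherwise clobber lanes that still need
+      // to be read.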
+      if (dst.is(src0)) {
+        __ vmov(kScratchQuadReg, src0);
+        src0 = kScratchQuadReg;
+      } else if (dst.is(src1)) {
+        __ vmov(kScratchQuadReg, src1);
+        src1 = kScratchQuadReg;
+      }
+      // Perform shuffle as a vmov per lane.
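+      // A Q-register code maps to 4 consecutive s-register codes, which is
+      // what VmovExtended expects. The shuffle immediate packs one lane index
+      // per byte, low byte first, e.g. 0x07050301 selects
+      // [src0[1], src0[3], src1[1], src1[3]].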
+      int dst_code = dst.code() * 4;
+      int src0_code = src0.code() * 4;
+      int src1_code = src1.code() * 4;
+      int32_t shuffle = i.InputInt32(2);
+      for (int i = 0; i < 4; i++) {
+        int lane = shuffle & 0x7;
+        int src_code = src0_code;
+        if (lane >= 4) {
+          src_code = src1_code;
+          lane &= 0x3;
+        }
+        __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
+        shuffle >>= 8;
+      }
+      break;
+    }
     case kArmS32x4TransposeRight: {
       Simd128Register dst = i.OutputSimd128Register(),
                       src1 = i.InputSimd128Register(1);
@@ -2249,6 +2287,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vtrn(Neon16, kScratchQuadReg, dst);  // dst = [1, 9, 3, 11, ... 15]
       break;
     }
+    case kArmS16x8Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      DwVfpRegister table_base = src0.low();
+      int table_size = GetVtblTableSize(src0, src1);
+      // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
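+      // Each 16-bit lane index n becomes byte indices 2 * n and 2 * n + 1,
+      // e.g. lane 5 expands to bytes 10 and 11.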
+      int scratch_s_base = kScratchQuadReg.code() * 4;
+      for (int j = 0; j < 2; j++) {
+        int32_t four_lanes = i.InputInt32(2 + j);
+        for (int k = 0; k < 2; k++) {
+          uint8_t w0 = (four_lanes & 0xF) * kShortSize;
+          four_lanes >>= 8;
+          uint8_t w1 = (four_lanes & 0xF) * kShortSize;
+          four_lanes >>= 8;
+          int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
+          // Ensure byte indices are in [0, 31] so masks are never NaNs.
+          mask &= 0x1F1F1F1F;
+          __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
+                  bit_cast<float>(mask));
+        }
+      }
+      NeonListOperand table(table_base, table_size);
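+      // vtbl reads the whole table, so when dst overlaps it the result is
+      // built in kScratchQuadReg and moved to dst afterwards.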
+      if (!dst.is(src0) && !dst.is(src1)) {
+        __ vtbl(dst.low(), table, kScratchQuadReg.low());
+        __ vtbl(dst.high(), table, kScratchQuadReg.high());
+      } else {
+        __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+        __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+        __ vmov(dst, kScratchQuadReg);
+      }
+      break;
+    }
     case kArmS8x16ZipLeft: {
       Simd128Register dst = i.OutputSimd128Register(),
                       src1 = i.InputSimd128Register(1);
@@ -2308,6 +2379,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1), i.InputInt4(2));
       break;
     }
+    case kArmS8x16Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      DwVfpRegister table_base = src0.low();
+      int table_size = GetVtblTableSize(src0, src1);
+      // The shuffle lane mask is a byte mask, materialize in kScratchQuadReg.
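+      // Byte indices 0-15 select from src0, and 16-31 from src1 when the
+      // table spans both sources.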
+      int scratch_s_base = kScratchQuadReg.code() * 4;
+      for (int j = 0; j < 4; j++) {
+        int32_t four_lanes = i.InputInt32(2 + j);
+        // Ensure byte indices are in [0, 31] so masks are never NaNs.
+        four_lanes &= 0x1F1F1F1F;
+        __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
+                bit_cast<float>(four_lanes));
+      }
+      NeonListOperand table(table_base, table_size);
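+      // As above, go through kScratchQuadReg when dst overlaps the table.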
+      if (!dst.is(src0) && !dst.is(src1)) {
+        __ vtbl(dst.low(), table, kScratchQuadReg.low());
+        __ vtbl(dst.high(), table, kScratchQuadReg.high());
+      } else {
+        __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+        __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+        __ vmov(dst, kScratchQuadReg);
+      }
+      break;
+    }
     case kArmS32x2Reverse: {
       __ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;