Index: src/compiler/arm/code-generator-arm.cc
diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
index 331a866662e3210086caa934389785823ab9b186..b69a70cd3360ce2ba7d2a889c2fede3a0ed30870 100644
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -347,6 +347,14 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
   return kNoCondition;
 }
 
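+// Determines the vtbl table size: NEON vtbl/vtbx take a list of 1 to 4
+// consecutive d-registers, and each Q-sized shuffle source contributes 2.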
+int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
+  // If unary shuffle, table is src0 (2 d-registers).
+  if (src0.is(src1)) return 2;
+  // Binary shuffle, table is src0, src1. They must be consecutive.
+  DCHECK_EQ(src0.code() + 1, src1.code());
+  return 4;  // 4 d-registers.
+}
+
 }  // namespace
 
 #define ASSEMBLE_CHECKED_LOAD_FP(Type) \
@@ -2186,6 +2194,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vtrn(Neon32, dst, kScratchQuadReg);  // dst = [0, 4, 2, 6]
       break;
     }
+    case kArmS32x4Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      // Check for in-place shuffles.
+      // If dst == src0 == src1, then the shuffle is unary and we only use src0.
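+      // When dst aliases a source, copy that source to kScratchQuadReg first;
+      // the per-lane moves below would otherwise clobber lanes that still need
+      // to be read.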
+      if (dst.is(src0)) {
+        __ vmov(kScratchQuadReg, src0);
+        src0 = kScratchQuadReg;
+      } else if (dst.is(src1)) {
+        __ vmov(kScratchQuadReg, src1);
+        src1 = kScratchQuadReg;
+      }
+      // Perform shuffle as a vmov per lane.
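+      // A Q-register code maps to 4 consecutive s-register codes, which is
+      // what VmovExtended expects. The shuffle immediate packs one lane index
+      // per byte, low byte first, e.g. 0x07050301 selects
+      // [src0[1], src0[3], src1[1], src1[3]].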
+      int dst_code = dst.code() * 4;
+      int src0_code = src0.code() * 4;
+      int src1_code = src1.code() * 4;
+      int32_t shuffle = i.InputInt32(2);
+      for (int i = 0; i < 4; i++) {
+        int lane = shuffle & 0x7;
+        int src_code = src0_code;
+        if (lane >= 4) {
+          src_code = src1_code;
+          lane &= 0x3;
+        }
+        __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
+        shuffle >>= 8;
+      }
+      break;
+    }
     case kArmS32x4TransposeRight: {
       Simd128Register dst = i.OutputSimd128Register(),
                       src1 = i.InputSimd128Register(1);
@@ -2249,6 +2287,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ vtrn(Neon16, kScratchQuadReg, dst);  // dst = [1, 9, 3, 11, ... 15]
       break;
     }
+    case kArmS16x8Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      DwVfpRegister table_base = src0.low();
+      int table_size = GetVtblTableSize(src0, src1);
+      // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
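+      // Each 16-bit lane index n becomes byte indices 2 * n and 2 * n + 1,
+      // e.g. lane 5 expands to bytes 10 and 11.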
+      int scratch_s_base = kScratchQuadReg.code() * 4;
+      for (int j = 0; j < 2; j++) {
+        int32_t four_lanes = i.InputInt32(2 + j);
+        for (int k = 0; k < 2; k++) {
+          uint8_t w0 = (four_lanes & 0xF) * kShortSize;
+          four_lanes >>= 8;
+          uint8_t w1 = (four_lanes & 0xF) * kShortSize;
+          four_lanes >>= 8;
+          int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
+          // Ensure byte indices are in [0, 31] so masks are never NaNs.
+          mask &= 0x1F1F1F1F;
+          __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
+                  bit_cast<float>(mask));
+        }
+      }
+      NeonListOperand table(table_base, table_size);
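+      // vtbl reads the whole table, so when dst overlaps it the result is
+      // built in kScratchQuadReg and moved to dst afterwards.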
+      if (!dst.is(src0) && !dst.is(src1)) {
+        __ vtbl(dst.low(), table, kScratchQuadReg.low());
+        __ vtbl(dst.high(), table, kScratchQuadReg.high());
+      } else {
+        __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+        __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+        __ vmov(dst, kScratchQuadReg);
+      }
+      break;
+    }
     case kArmS8x16ZipLeft: {
       Simd128Register dst = i.OutputSimd128Register(),
                       src1 = i.InputSimd128Register(1);
@@ -2308,6 +2379,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1), i.InputInt4(2));
       break;
     }
+    case kArmS8x16Shuffle: {
+      Simd128Register dst = i.OutputSimd128Register(),
+                      src0 = i.InputSimd128Register(0),
+                      src1 = i.InputSimd128Register(1);
+      DwVfpRegister table_base = src0.low();
+      int table_size = GetVtblTableSize(src0, src1);
+      // The shuffle lane mask is a byte mask, materialize in kScratchQuadReg.
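+      // Byte indices 0-15 select from src0, and 16-31 from src1 when the
+      // table spans both sources.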
+      int scratch_s_base = kScratchQuadReg.code() * 4;
+      for (int j = 0; j < 4; j++) {
+        int32_t four_lanes = i.InputInt32(2 + j);
+        // Ensure byte indices are in [0, 31] so masks are never NaNs.
+        four_lanes &= 0x1F1F1F1F;
+        __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
+                bit_cast<float>(four_lanes));
+      }
+      NeonListOperand table(table_base, table_size);
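+      // As above, go through kScratchQuadReg when dst overlaps the table.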
+      if (!dst.is(src0) && !dst.is(src1)) {
+        __ vtbl(dst.low(), table, kScratchQuadReg.low());
+        __ vtbl(dst.high(), table, kScratchQuadReg.high());
+      } else {
+        __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+        __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+        __ vmov(dst, kScratchQuadReg);
+      }
+      break;
+    }
     case kArmS32x2Reverse: {
       __ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;