Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(81)

Unified Diff: src/compiler/arm/code-generator-arm.cc

Issue 2856363003: [ARM] Implement irregular vector shuffles for SIMD. (Closed)
Patch Set: Review comments. Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | src/compiler/arm/instruction-codes-arm.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/compiler/arm/code-generator-arm.cc
diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
index 331a866662e3210086caa934389785823ab9b186..b69a70cd3360ce2ba7d2a889c2fede3a0ed30870 100644
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -347,6 +347,14 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
return kNoCondition;
}
+// Returns the number of d-registers forming the vtbl lookup table for a
+// shuffle that reads from |src0| and |src1|.
+int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
+  const bool is_binary = !src0.is(src1);
+  if (is_binary) {
+    // The table is src0 followed by src1, so src1 must directly follow src0.
+    DCHECK_EQ(src0.code() + 1, src1.code());
+    return 4;  // 4 d-registers.
+  }
+  // Unary shuffle: the table is src0 alone (2 d-registers).
+  return 2;
+}
+
} // namespace
#define ASSEMBLE_CHECKED_LOAD_FP(Type) \
@@ -2186,6 +2194,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vtrn(Neon32, dst, kScratchQuadReg); // dst = [0, 4, 2, 6]
break;
}
+ case kArmS32x4Shuffle: {
+   // Arbitrary shuffle of 32-bit lanes drawn from src0 and src1, emitted as
+   // one single-lane move per destination lane.
+   Simd128Register dst = i.OutputSimd128Register(),
+                   src0 = i.InputSimd128Register(0),
+                   src1 = i.InputSimd128Register(1);
+   // Check for in-place shuffles: a source that aliases dst is copied to the
+   // scratch register first, so the per-lane moves below never read a lane
+   // that has already been overwritten.
+   // If dst == src0 == src1, then the shuffle is unary and we only use src0.
+   if (dst.is(src0)) {
+     __ vmov(kScratchQuadReg, src0);
+     src0 = kScratchQuadReg;
+   } else if (dst.is(src1)) {
+     __ vmov(kScratchQuadReg, src1);
+     src1 = kScratchQuadReg;
+   }
+   // Perform shuffle as a vmov per lane. Register codes are scaled by 4
+   // (four 32-bit lanes per 128-bit register) to address individual lanes.
+   int dst_code = dst.code() * 4;
+   int src0_code = src0.code() * 4;
+   int src1_code = src1.code() * 4;
+   int32_t shuffle = i.InputInt32(2);
+   // The loop index is not named |i|: that would shadow the outer |i| used
+   // above to read the instruction's operands.
+   for (int dst_lane = 0; dst_lane < 4; dst_lane++) {
+     // The low 3 bits of each byte of |shuffle| select the source lane:
+     // 0-3 come from src0, 4-7 from src1.
+     int lane = shuffle & 0x7;
+     int src_code = src0_code;
+     if (lane >= 4) {
+       src_code = src1_code;
+       lane &= 0x3;
+     }
+     __ VmovExtended(dst_code + dst_lane, src_code + lane, kScratchReg);
+     shuffle >>= 8;
+   }
+   break;
+ }
case kArmS32x4TransposeRight: {
Simd128Register dst = i.OutputSimd128Register(),
src1 = i.InputSimd128Register(1);
@@ -2249,6 +2287,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vtrn(Neon16, kScratchQuadReg, dst); // dst = [1, 9, 3, 11, ... 15]
break;
}
+ case kArmS16x8Shuffle: {
+   // Arbitrary shuffle of 16-bit lanes drawn from src0 and src1. The lane
+   // indices are expanded to byte indices and the shuffle is performed with
+   // two vtbl table lookups (one per destination d-register).
+   Simd128Register dst = i.OutputSimd128Register(),
+                   src0 = i.InputSimd128Register(0),
+                   src1 = i.InputSimd128Register(1);
+   DwVfpRegister table_base = src0.low();
+   int table_size = GetVtblTableSize(src0, src1);
+   // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
+   int scratch_s_base = kScratchQuadReg.code() * 4;
+   for (int j = 0; j < 2; j++) {
+     // Each immediate input packs four lane indices, one per byte.
+     int32_t four_lanes = i.InputInt32(2 + j);
+     for (int k = 0; k < 2; k++) {
+       // Byte offset of each selected 16-bit lane. The lane index is masked
+       // to 4 bits, so each offset is at most 30 (assuming kShortSize == 2).
+       uint8_t w0 = (four_lanes & 0xF) * kShortSize;
+       four_lanes >>= 8;
+       uint8_t w1 = (four_lanes & 0xF) * kShortSize;
+       four_lanes >>= 8;
+       // A lane at byte offset w occupies bytes w and w + 1. Every byte of
+       // |mask| is therefore already in [0, 31], so the bit pattern can
+       // never be a NaN and no further masking is required.
+       int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
+       __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
+               bit_cast<float>(mask));
+     }
+   }
+   NeonListOperand table(table_base, table_size);
+   if (!dst.is(src0) && !dst.is(src1)) {
+     __ vtbl(dst.low(), table, kScratchQuadReg.low());
+     __ vtbl(dst.high(), table, kScratchQuadReg.high());
+   } else {
+     // dst is part of the table: build the result in the scratch register so
+     // the first lookup does not overwrite table entries the second needs.
+     __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+     __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+     __ vmov(dst, kScratchQuadReg);
+   }
+   break;
+ }
case kArmS8x16ZipLeft: {
Simd128Register dst = i.OutputSimd128Register(),
src1 = i.InputSimd128Register(1);
@@ -2308,6 +2379,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1), i.InputInt4(2));
break;
}
+ case kArmS8x16Shuffle: {
+   // Arbitrary byte shuffle of src0 and src1, performed with two vtbl table
+   // lookups whose index vector is built in kScratchQuadReg.
+   Simd128Register dst = i.OutputSimd128Register(),
+                   src0 = i.InputSimd128Register(0),
+                   src1 = i.InputSimd128Register(1);
+   DwVfpRegister first_table_reg = src0.low();
+   int table_reg_count = GetVtblTableSize(src0, src1);
+   // The shuffle lane mask is a byte mask, materialize in kScratchQuadReg
+   // one 32-bit word (four byte indices) at a time.
+   const int first_scratch_s = kScratchQuadReg.code() * 4;
+   for (int word = 0; word < 4; word++) {
+     // Ensure byte indices are in [0, 31] so masks are never NaNs.
+     const int32_t byte_indices = i.InputInt32(2 + word) & 0x1F1F1F1F;
+     __ vmov(SwVfpRegister::from_code(first_scratch_s + word),
+             bit_cast<float>(byte_indices));
+   }
+   NeonListOperand table(first_table_reg, table_reg_count);
+   const bool dst_is_source = dst.is(src0) || dst.is(src1);
+   if (dst_is_source) {
+     // Build the result in the scratch register, then copy it to dst.
+     __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+     __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+     __ vmov(dst, kScratchQuadReg);
+   } else {
+     __ vtbl(dst.low(), table, kScratchQuadReg.low());
+     __ vtbl(dst.high(), table, kScratchQuadReg.high());
+   }
+   break;
+ }
case kArmS32x2Reverse: {
__ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
« no previous file with comments | « no previous file | src/compiler/arm/instruction-codes-arm.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698