Chromium Code Reviews

Unified Diff: src/compiler/arm/code-generator-arm.cc

Issue 2856363003: [ARM] Implement irregular vector shuffles for SIMD. (Closed)
Patch Set: Factor out more common shuffle code. Created 3 years, 7 months ago
Index: src/compiler/arm/code-generator-arm.cc
diff --git a/src/compiler/arm/code-generator-arm.cc b/src/compiler/arm/code-generator-arm.cc
index 331a866662e3210086caa934389785823ab9b186..82e04987e656cf44a74afadd4afa630d4839c71d 100644
--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -347,6 +347,14 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
return kNoCondition;
}
+int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
+ // If unary shuffle, table is src0 (2 d-registers).
+ if (src0.is(src1)) return 2;
+ // Binary shuffle: the table is src0, src1, and they must be consecutive.
+ DCHECK_EQ(src0.code() + 1, src1.code());
+ return 4; // 4 d-registers.
+}
+
} // namespace
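(Note, not part of the patch: the table size computed above feeds NEON vtbl, which gathers bytes from a table of consecutive d-registers; any selector index past the end of the table yields 0. A minimal scalar sketch of that lookup behavior, with an illustrative function name and signature:)

#include <cstddef>
#include <cstdint>

// Scalar model of NEON vtbl: 'table' holds table_size * 8 bytes
// (2 d-registers for a unary shuffle, 4 for a binary one). Selector
// indices outside the table produce 0, matching the hardware.
void VtblModel(uint8_t* dst, const uint8_t* table, size_t table_bytes,
               const uint8_t* indices, size_t n) {
  for (size_t i = 0; i < n; i++) {
    dst[i] = indices[i] < table_bytes ? table[indices[i]] : 0;
  }
}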
#define ASSEMBLE_CHECKED_LOAD_FP(Type) \
@@ -2186,6 +2194,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vtrn(Neon32, dst, kScratchQuadReg); // dst = [0, 4, 2, 6]
break;
}
+ case kArmS32x4Shuffle: {
+ Simd128Register dst = i.OutputSimd128Register(),
+ src0 = i.InputSimd128Register(0),
+ src1 = i.InputSimd128Register(1);
+ // Check for in-place shuffles.
+ // If dst == src0 == src1, then the shuffle is unary and we only use src0.
+ if (dst.is(src0)) {
+ __ vmov(kScratchQuadReg, src0);
+ src0 = kScratchQuadReg;
+ } else if (dst.is(src1)) {
+ __ vmov(kScratchQuadReg, src1);
+ src1 = kScratchQuadReg;
+ }
+ // Perform shuffle as a vmov per lane.
+ int dst_code = dst.code() * 4;
+ int src0_code = src0.code() * 4;
+ int src1_code = src1.code() * 4;
+ int32_t shuffle = i.InputInt32(2);
+ for (int i = 0; i < 4; i++) {
+ int lane = shuffle & 0x7;
+ int src_code = src0_code;
+ if (lane >= 4) {
+ src_code = src1_code;
+ lane &= 0x3;
+ }
+ __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
martyn.capewell 2017/05/05 14:28:49 This will become expensive when each s-register mo
bbudge 2017/05/05 20:36:28 Yes, I think I have a TODO to improve VMovExtended
+ shuffle >>= 8;
+ }
+ break;
+ }
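(Illustration, not from the patch: a scalar model of the lane selection performed by the loop above, assuming the same packed encoding of one selector byte per destination lane, low byte first. The helper name is hypothetical.)

#include <cstdint>

// dst, src0, src1 each hold four 32-bit lanes; 'shuffle' packs one
// selector byte per destination lane. Selectors 0-3 pick from src0,
// 4-7 pick from src1.
void S32x4ShuffleModel(uint32_t dst[4], const uint32_t src0[4],
                       const uint32_t src1[4], int32_t shuffle) {
  for (int i = 0; i < 4; i++) {
    int lane = shuffle & 0x7;  // selector for destination lane i
    dst[i] = lane >= 4 ? src1[lane & 0x3] : src0[lane];
    shuffle >>= 8;             // advance to the next selector byte
  }
}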
case kArmS32x4TransposeRight: {
Simd128Register dst = i.OutputSimd128Register(),
src1 = i.InputSimd128Register(1);
@@ -2249,6 +2287,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vtrn(Neon16, kScratchQuadReg, dst); // dst = [1, 9, 3, 11, ... 15]
break;
}
+ case kArmS16x8Shuffle: {
+ Simd128Register dst = i.OutputSimd128Register(),
+ src0 = i.InputSimd128Register(0),
+ src1 = i.InputSimd128Register(1);
+ DwVfpRegister table_base = src0.low();
+ int table_size = GetVtblTableSize(src0, src1);
+ // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
+ int scratch_s_base = kScratchQuadReg.code() * 4;
+ for (int j = 0; j < 2; j++) {
+ int32_t four_lanes = i.InputInt32(2 + j);
+ for (int k = 0; k < 2; k++) {
+ uint8_t w0 = (four_lanes & 0xFF) * kShortSize;
+ four_lanes >>= 8;
+ uint8_t w1 = (four_lanes & 0xFF) * kShortSize;
+ four_lanes >>= 8;
+ int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
+ __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
+ bit_cast<float>(mask));
martyn.capewell 2017/05/05 14:28:49 This may cause a problem - if your mask looks like
bbudge 2017/05/05 20:36:28 I could also finesse the code so NaNs can't be gen
martyn.capewell 2017/05/08 13:33:53 That will fix the NaN problem. However, the assemb
+ }
+ }
+ NeonListOperand table(table_base, table_size);
+ __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+ __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+ __ vmov(dst, kScratchQuadReg);
martyn.capewell 2017/05/05 14:28:49 If you know dst doesn't alias src0 or src1, vtbl c
bbudge 2017/05/05 20:36:28 Nice, done.
+ break;
+ }
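(Illustration, not from the patch: a scalar sketch of the mask expansion above, assuming each input byte is a 16-bit lane index that becomes the byte-index pair (2n, 2n+1) for vtbl. The result is written here as a flat 16-byte selector array rather than packed 32-bit words, and the helper name is hypothetical.)

#include <cstdint>

// Expands eight 16-bit lane selectors (one byte each, packed into two
// 32-bit words, low byte first) into sixteen byte selectors for vtbl.
// kShortSize is 2 on ARM, so lane index n maps to bytes 2n and 2n + 1.
void ExpandS16x8Mask(uint8_t byte_mask[16], const int32_t four_lanes[2]) {
  const int kShortSize = 2;
  for (int j = 0; j < 2; j++) {
    int32_t lanes = four_lanes[j];
    for (int k = 0; k < 4; k++) {
      uint8_t w = (lanes & 0xFF) * kShortSize;  // first byte of the lane
      byte_mask[8 * j + 2 * k] = w;
      byte_mask[8 * j + 2 * k + 1] = w + 1;     // second byte of the lane
      lanes >>= 8;
    }
  }
}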
case kArmS8x16ZipLeft: {
Simd128Register dst = i.OutputSimd128Register(),
src1 = i.InputSimd128Register(1);
@@ -2308,6 +2372,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1), i.InputInt4(2));
break;
}
+ case kArmS8x16Shuffle: {
+ Simd128Register dst = i.OutputSimd128Register(),
+ src0 = i.InputSimd128Register(0),
+ src1 = i.InputSimd128Register(1);
+ DwVfpRegister table_base = src0.low();
+ int table_size = GetVtblTableSize(src0, src1);
+ // The shuffle lane mask is a byte mask, materialize in kScratchQuadReg.
+ int scratch_s_base = kScratchQuadReg.code() * 4;
+ for (int j = 0; j < 4; j++) {
+ int32_t four_lanes = i.InputInt32(2 + j);
+ __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
+ bit_cast<float>(four_lanes));
+ }
+ NeonListOperand table(table_base, table_size);
+ __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
+ __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
+ __ vmov(dst, kScratchQuadReg);
+ break;
+ }
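(Illustration, not from the patch: an end-to-end scalar reference for the binary case of what the vtbl sequence above computes, assuming the same encoding of four 32-bit immediates holding sixteen byte selectors into the concatenated sources. Names are hypothetical.)

#include <cstdint>
#include <cstring>

// src0 and src1 are the two 16-byte inputs; 'mask' packs one selector
// byte per destination byte (0-31), low byte of mask[0] first.
void S8x16ShuffleModel(uint8_t dst[16], const uint8_t src0[16],
                       const uint8_t src1[16], const int32_t mask[4]) {
  uint8_t table[32];
  std::memcpy(table, src0, 16);       // table is src0 followed by src1
  std::memcpy(table + 16, src1, 16);
  for (int j = 0; j < 4; j++) {
    int32_t four = mask[j];
    for (int k = 0; k < 4; k++) {
      dst[4 * j + k] = table[four & 0x1F];  // gather one byte
      four >>= 8;
    }
  }
}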
case kArmS32x2Reverse: {
__ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
break;