src/compiler/arm/instruction-selector-arm.cc - Issue 2769723003: [arm][turbofan] Use NEON for unaligned float64 memory accesses

Unified Diff: src/compiler/arm/instruction-selector-arm.cc

Issue 2769723003: [arm][turbofan] Use NEON for unaligned float64 memory accesses (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/compiler/arm/instruction-selector-arm.cc

diff --git a/src/compiler/arm/instruction-selector-arm.cc b/src/compiler/arm/instruction-selector-arm.cc

index fb04eb1cf44c929eebdb9899e9a25a1db8a42a44..b9cd35ad9a921fad503c2b6c0d233580e5355c92 100644

--- a/src/compiler/arm/instruction-selector-arm.cc

+++ b/src/compiler/arm/instruction-selector-arm.cc

@@ -560,37 +560,41 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {

return;

}

case MachineRepresentation::kFloat64: {

- // TODO(arm): use vld1.8 for this when NEON is available.

- // Compute the address of the least-significant half of the FP value.

- // We assume that the base node is unlikely to be an encodable immediate

- // or the result of a shift operation, so only consider the addressing

- // mode that should be used for the index node.

- InstructionCode add_opcode = kArmAdd;

- InstructionOperand inputs[3];

- inputs[0] = g.UseRegister(base);

- size_t input_count;

- if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,

- &inputs[1])) {

- // input_count has been set by TryMatchImmediateOrShift(), so increment

- // it to account for the base register in inputs[0].

- input_count++;

+ if (CpuFeatures::IsSupported(NEON)) {

+ InstructionOperand output = g.DefineAsRegister(node);

+ EmitLoad(this, kArmVld1F64, &output, base, index);

} else {

- add_opcode |= AddressingModeField::encode(kMode_Operand2_R);

- inputs[1] = g.UseRegister(index);

- input_count = 2; // Base register and index.

- }

+ // Compute the address of the least-significant half of the FP value.

+ // We assume that the base node is unlikely to be an encodable immediate

+ // or the result of a shift operation, so only consider the addressing

+ // mode that should be used for the index node.

+ InstructionCode add_opcode = kArmAdd;

+ InstructionOperand inputs[3];

+ inputs[0] = g.UseRegister(base);

+ size_t input_count;

+ if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,

+ &inputs[1])) {

+ // input_count has been set by TryMatchImmediateOrShift(), so

+ // increment it to account for the base register in inputs[0].

+ input_count++;

+ } else {

+ add_opcode |= AddressingModeField::encode(kMode_Operand2_R);

+ inputs[1] = g.UseRegister(index);

+ input_count = 2; // Base register and index.

+ }

- InstructionOperand addr = g.TempRegister();

- Emit(add_opcode, 1, &addr, input_count, inputs);

+ InstructionOperand addr = g.TempRegister();

+ Emit(add_opcode, 1, &addr, input_count, inputs);

- // Load both halves and move to an FP register.

- InstructionOperand fp_lo = g.TempRegister();

- InstructionOperand fp_hi = g.TempRegister();

- opcode |= AddressingModeField::encode(kMode_Offset_RI);

- Emit(opcode, fp_lo, addr, g.TempImmediate(0));

- Emit(opcode, fp_hi, addr, g.TempImmediate(4));

- Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);

+ // Load both halves and move to an FP register.

+ InstructionOperand fp_lo = g.TempRegister();

+ InstructionOperand fp_hi = g.TempRegister();

+ opcode |= AddressingModeField::encode(kMode_Offset_RI);

+ Emit(opcode, fp_lo, addr, g.TempImmediate(0));

+ Emit(opcode, fp_hi, addr, g.TempImmediate(4));

+ Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);

+ }

return;

}

default:

@@ -624,30 +628,35 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {

return;

}

case MachineRepresentation::kFloat64: {

- // TODO(arm): use vst1.8 for this when NEON is available.

- // Store a 64-bit floating point value using two 32-bit integer stores.

- // Computing the store address here would require three live temporary

- // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after

- // storing the least-significant half of the value.

- // First, move the 64-bit FP value into two temporary integer registers.

- InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};

- inputs[input_count++] = g.UseRegister(value);

- Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count,

- inputs);

- // Store the least-significant half.

- inputs[0] = fp[0]; // Low 32-bits of FP value.

- inputs[input_count++] = g.UseRegister(base); // First store base address.

- EmitStore(this, kArmStr, input_count, inputs, index);

- // Store the most-significant half.

- InstructionOperand base4 = g.TempRegister();

- Emit(kArmAdd | AddressingModeField::encode(kMode_Operand2_I), base4,

- g.UseRegister(base), g.TempImmediate(4)); // Compute base + 4.

- inputs[0] = fp[1]; // High 32-bits of FP value.

- inputs[1] = base4; // Second store base + 4 address.

- EmitStore(this, kArmStr, input_count, inputs, index);

+ if (CpuFeatures::IsSupported(NEON)) {

+ inputs[input_count++] = g.UseRegister(value);

+ inputs[input_count++] = g.UseRegister(base);

+ EmitStore(this, kArmVst1F64, input_count, inputs, index);

+ } else {

+ // Store a 64-bit floating point value using two 32-bit integer stores.

+ // Computing the store address here would require three live temporary

+ // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after

+ // storing the least-significant half of the value.

+ // First, move the 64-bit FP value into two temporary integer registers.

+ InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};

+ inputs[input_count++] = g.UseRegister(value);

+ Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count, inputs);

+ // Store the least-significant half.

+ inputs[0] = fp[0]; // Low 32-bits of FP value.

+ inputs[input_count++] =

+ g.UseRegister(base); // First store base address.

+ EmitStore(this, kArmStr, input_count, inputs, index);

+ // Store the most-significant half.

+ InstructionOperand base4 = g.TempRegister();

+ Emit(kArmAdd | AddressingModeField::encode(kMode_Operand2_I), base4,

+ g.UseRegister(base), g.TempImmediate(4)); // Compute base + 4.

+ inputs[0] = fp[1]; // High 32-bits of FP value.

+ inputs[1] = base4; // Second store base + 4 address.

+ EmitStore(this, kArmStr, input_count, inputs, index);

+ }

return;

}

default:

« src/compiler/arm/code-generator-arm.cc ('K') | « src/compiler/arm/instruction-scheduler-arm.cc ('k') | no next file » | no next file with comments »