Chromium Code Reviews| Index: src/arm/macro-assembler-arm.cc |
| diff --git a/src/arm/macro-assembler-arm.cc b/src/arm/macro-assembler-arm.cc |
| index 30fb52729d0df377afc7b0404d0e81d2f94890e4..44bc66466dc29c8b972898daa156252278a0a9ca 100644 |
| --- a/src/arm/macro-assembler-arm.cc |
| +++ b/src/arm/macro-assembler-arm.cc |
| @@ -1079,47 +1079,78 @@ void MacroAssembler::VmovExtended(int dst_code, Register src) { |
| } |
| } |
| -void MacroAssembler::VmovExtended(int dst_code, int src_code, |
| - Register scratch) { |
| +void MacroAssembler::VmovExtended(int dst_code, int src_code) { |
| if (src_code < SwVfpRegister::kMaxNumRegisters && |
| dst_code < SwVfpRegister::kMaxNumRegisters) { |
| // src and dst are both s-registers. |
| vmov(SwVfpRegister::from_code(dst_code), |
| SwVfpRegister::from_code(src_code)); |
| - } else if (src_code < SwVfpRegister::kMaxNumRegisters) { |
| - // src is an s-register. |
| - vmov(scratch, SwVfpRegister::from_code(src_code)); |
| - VmovExtended(dst_code, scratch); |
| + return; |
| + } |
| + DwVfpRegister dst_d_reg = DwVfpRegister::from_code(dst_code / 2); |
| + DwVfpRegister src_d_reg = DwVfpRegister::from_code(src_code / 2); |
| + int dst_offset = dst_code & 1; |
| + int src_offset = src_code & 1; |
| + if (CpuFeatures::IsSupported(NEON)) { |
| + // On Neon we can shift and insert from d-registers. |
| + if (src_offset == dst_offset) { |
| + // Offsets are the same, use vdup to copy the source to the opposite lane. |
| + vdup(Neon32, kScratchDoubleReg, src_d_reg, src_offset); |
| + src_d_reg = kScratchDoubleReg; |
| + src_offset = dst_offset ^ 1; |
| + } |
|
martyn.capewell
2017/05/10 12:38:06
There's a tiny optimisation here: as vdup is sligh… [comment truncated in this capture]
bbudge
2017/05/10 17:54:52
Nice. Done.
|
| + if (dst_offset) { |
| + vsli(Neon64, dst_d_reg, src_d_reg, 32); |
| + } else { |
| + vsri(Neon64, dst_d_reg, src_d_reg, 32); |
| + } |
| + return; |
| + } |
| + |
| + // Without Neon, use the scratch registers to move src and/or dst into |
| + // s-registers. |
| + int scratchSCode = kScratchDoubleReg.low().code(); |
| + int scratchSCode2 = kScratchDoubleReg2.low().code(); |
| + if (src_code < SwVfpRegister::kMaxNumRegisters) { |
| + // src is an s-register, dst is not. |
| + vmov(kScratchDoubleReg, dst_d_reg); |
| + vmov(SwVfpRegister::from_code(scratchSCode + dst_offset), |
| + SwVfpRegister::from_code(src_code)); |
| + vmov(dst_d_reg, kScratchDoubleReg); |
| } else if (dst_code < SwVfpRegister::kMaxNumRegisters) { |
| - // dst is an s-register. |
| - VmovExtended(scratch, src_code); |
| - vmov(SwVfpRegister::from_code(dst_code), scratch); |
| + // dst is an s-register, src is not. |
| + vmov(kScratchDoubleReg, src_d_reg); |
| + vmov(SwVfpRegister::from_code(dst_code), |
| + SwVfpRegister::from_code(scratchSCode + src_offset)); |
| } else { |
| - // Neither src or dst are s-registers. |
| - DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code); |
| - DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code); |
| - VmovExtended(scratch, src_code); |
| - VmovExtended(dst_code, scratch); |
| + // Neither src or dst are s-registers. Both scratch double registers are |
| + // available when there are 32 VFP registers. |
| + vmov(kScratchDoubleReg, src_d_reg); |
| + vmov(kScratchDoubleReg2, dst_d_reg); |
| + vmov(SwVfpRegister::from_code(scratchSCode + dst_offset), |
| + SwVfpRegister::from_code(scratchSCode2 + src_offset)); |
| + vmov(dst_d_reg, kScratchQuadReg.high()); |
| } |
| } |
| -void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src, |
| - Register scratch) { |
| - if (dst_code >= SwVfpRegister::kMaxNumRegisters) { |
| - ldr(scratch, src); |
| - VmovExtended(dst_code, scratch); |
| - } else { |
| +void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src) { |
| + if (dst_code < SwVfpRegister::kMaxNumRegisters) { |
| vldr(SwVfpRegister::from_code(dst_code), src); |
| + } else { |
| + int dst_s_code = kScratchDoubleReg.low().code() + (dst_code & 1); |
|
martyn.capewell
2017/05/10 12:38:06
When supported, this can use the "single element t… [comment truncated in this capture]
bbudge
2017/05/10 17:54:52
Added TODO. Thanks.
|
| + vmov(kScratchDoubleReg, DwVfpRegister::from_code(dst_code / 2)); |
| + vldr(SwVfpRegister::from_code(dst_s_code), src); |
| + vmov(DwVfpRegister::from_code(dst_code / 2), kScratchDoubleReg); |
| } |
| } |
| -void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code, |
| - Register scratch) { |
| - if (src_code >= SwVfpRegister::kMaxNumRegisters) { |
| - VmovExtended(scratch, src_code); |
| - str(scratch, dst); |
| - } else { |
| +void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code) { |
| + if (src_code < SwVfpRegister::kMaxNumRegisters) { |
| vstr(SwVfpRegister::from_code(src_code), dst); |
| + } else { |
| + int src_s_code = kScratchDoubleReg.low().code() + (src_code & 1); |
| + vmov(kScratchDoubleReg, DwVfpRegister::from_code(src_code / 2)); |
| + vstr(SwVfpRegister::from_code(src_s_code), dst); |
| } |
| } |
| @@ -1145,9 +1176,9 @@ void MacroAssembler::ExtractLane(Register dst, DwVfpRegister src, |
| } |
| void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src, |
| - Register scratch, int lane) { |
| + int lane) { |
| int s_code = src.code() * 4 + lane; |
| - VmovExtended(dst.code(), s_code, scratch); |
| + VmovExtended(dst.code(), s_code); |
| } |
| void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, |
| @@ -1164,11 +1195,10 @@ void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, |
| } |
| void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, |
| - SwVfpRegister src_lane, Register scratch, |
| - int lane) { |
| + SwVfpRegister src_lane, int lane) { |
| Move(dst, src); |
| int s_code = dst.code() * 4 + lane; |
| - VmovExtended(s_code, src_lane.code(), scratch); |
| + VmovExtended(s_code, src_lane.code()); |
| } |
| void MacroAssembler::LslPair(Register dst_low, Register dst_high, |