# Chromium Code Reviews

## Issue 2546933002: [Turbofan] Add ARM NEON instructions for implementing SIMD. (Closed)

  | OLD | NEW | 
|---|---|
| 1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. | 
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be | 
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. | 
| 4 | 4 | 
| 5 #include <limits.h> // For LONG_MIN, LONG_MAX. | 5 #include <limits.h> // For LONG_MIN, LONG_MAX. | 
| 6 | 6 | 
| 7 #if V8_TARGET_ARCH_ARM | 7 #if V8_TARGET_ARCH_ARM | 
| 8 | 8 | 
| 9 #include "src/base/bits.h" | 9 #include "src/base/bits.h" | 
| 10 #include "src/base/division-by-constant.h" | 10 #include "src/base/division-by-constant.h" | 
| (...skipping 1063 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1074 void MacroAssembler::VmovLow(DwVfpRegister dst, Register src) { | 1074 void MacroAssembler::VmovLow(DwVfpRegister dst, Register src) { | 
| 1075 if (dst.code() < 16) { | 1075 if (dst.code() < 16) { | 
| 1076 const LowDwVfpRegister loc = LowDwVfpRegister::from_code(dst.code()); | 1076 const LowDwVfpRegister loc = LowDwVfpRegister::from_code(dst.code()); | 
| 1077 vmov(loc.low(), src); | 1077 vmov(loc.low(), src); | 
| 1078 } else { | 1078 } else { | 
| 1079 vmov(dst, VmovIndexLo, src); | 1079 vmov(dst, VmovIndexLo, src); | 
| 1080 } | 1080 } | 
| 1081 } | 1081 } | 
| 1082 | 1082 | 
| 1083 void MacroAssembler::VmovExtended(Register dst, int src_code) { | 1083 void MacroAssembler::VmovExtended(Register dst, int src_code) { | 
| 1084 DCHECK_LE(32, src_code); | 1084 DCHECK_LE(SwVfpRegister::kMaxNumRegisters, src_code); | 
| 1085 DCHECK_GT(64, src_code); | 1085 DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code); | 
| 1086 if (src_code & 0x1) { | 1086 if (src_code & 0x1) { | 
| 1087 VmovHigh(dst, DwVfpRegister::from_code(src_code / 2)); | 1087 VmovHigh(dst, DwVfpRegister::from_code(src_code / 2)); | 
| 1088 } else { | 1088 } else { | 
| 1089 VmovLow(dst, DwVfpRegister::from_code(src_code / 2)); | 1089 VmovLow(dst, DwVfpRegister::from_code(src_code / 2)); | 
| 1090 } | 1090 } | 
| 1091 } | 1091 } | 
| 1092 | 1092 | 
| 1093 void MacroAssembler::VmovExtended(int dst_code, Register src) { | 1093 void MacroAssembler::VmovExtended(int dst_code, Register src) { | 
| 1094 DCHECK_LE(32, dst_code); | 1094 DCHECK_LE(SwVfpRegister::kMaxNumRegisters, dst_code); | 
| 1095 DCHECK_GT(64, dst_code); | 1095 DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code); | 
| 1096 if (dst_code & 0x1) { | 1096 if (dst_code & 0x1) { | 
| 1097 VmovHigh(DwVfpRegister::from_code(dst_code / 2), src); | 1097 VmovHigh(DwVfpRegister::from_code(dst_code / 2), src); | 
| 1098 } else { | 1098 } else { | 
| 1099 VmovLow(DwVfpRegister::from_code(dst_code / 2), src); | 1099 VmovLow(DwVfpRegister::from_code(dst_code / 2), src); | 
| 1100 } | 1100 } | 
| 1101 } | 1101 } | 
| 1102 | 1102 | 
| 1103 void MacroAssembler::VmovExtended(int dst_code, int src_code, | 1103 void MacroAssembler::VmovExtended(int dst_code, int src_code, | 
| 1104 Register scratch) { | 1104 Register scratch) { | 
| 1105 if (src_code < 32 && dst_code < 32) { | 1105 if (src_code < SwVfpRegister::kMaxNumRegisters && | 
| 1106 dst_code < SwVfpRegister::kMaxNumRegisters) { | |
| 1106 // src and dst are both s-registers. | 1107 // src and dst are both s-registers. | 
| 1107 vmov(SwVfpRegister::from_code(dst_code), | 1108 vmov(SwVfpRegister::from_code(dst_code), | 
| 1108 SwVfpRegister::from_code(src_code)); | 1109 SwVfpRegister::from_code(src_code)); | 
| 1109 } else if (src_code < 32) { | 1110 } else if (src_code < SwVfpRegister::kMaxNumRegisters) { | 
| 1110 // src is an s-register. | 1111 // src is an s-register. | 
| 1111 vmov(scratch, SwVfpRegister::from_code(src_code)); | 1112 vmov(scratch, SwVfpRegister::from_code(src_code)); | 
| 1112 VmovExtended(dst_code, scratch); | 1113 VmovExtended(dst_code, scratch); | 
| 1113 } else if (dst_code < 32) { | 1114 } else if (dst_code < SwVfpRegister::kMaxNumRegisters) { | 
| 1114 // dst is an s-register. | 1115 // dst is an s-register. | 
| 1115 VmovExtended(scratch, src_code); | 1116 VmovExtended(scratch, src_code); | 
| 1116 vmov(SwVfpRegister::from_code(dst_code), scratch); | 1117 vmov(SwVfpRegister::from_code(dst_code), scratch); | 
| 1117 } else { | 1118 } else { | 
| 1118 // Neither src or dst are s-registers. | 1119 // Neither src or dst are s-registers. | 
| 1119 DCHECK_GT(64, src_code); | 1120 DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code); | 
| 1120 DCHECK_GT(64, dst_code); | 1121 DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code); | 
| 1121 VmovExtended(scratch, src_code); | 1122 VmovExtended(scratch, src_code); | 
| 1122 VmovExtended(dst_code, scratch); | 1123 VmovExtended(dst_code, scratch); | 
| 1123 } | 1124 } | 
| 1124 } | 1125 } | 
| 1125 | 1126 | 
| 1126 void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src, | 1127 void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src, | 
| 1127 Register scratch) { | 1128 Register scratch) { | 
| 1128 if (dst_code >= 32) { | 1129 if (dst_code >= SwVfpRegister::kMaxNumRegisters) { | 
| 1129 ldr(scratch, src); | 1130 ldr(scratch, src); | 
| 1130 VmovExtended(dst_code, scratch); | 1131 VmovExtended(dst_code, scratch); | 
| 1131 } else { | 1132 } else { | 
| 1132 vldr(SwVfpRegister::from_code(dst_code), src); | 1133 vldr(SwVfpRegister::from_code(dst_code), src); | 
| 1133 } | 1134 } | 
| 1134 } | 1135 } | 
| 1135 | 1136 | 
| 1136 void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code, | 1137 void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code, | 
| 1137 Register scratch) { | 1138 Register scratch) { | 
| 1138 if (src_code >= 32) { | 1139 if (src_code >= SwVfpRegister::kMaxNumRegisters) { | 
| 1139 VmovExtended(scratch, src_code); | 1140 VmovExtended(scratch, src_code); | 
| 1140 str(scratch, dst); | 1141 str(scratch, dst); | 
| 1141 } else { | 1142 } else { | 
| 1142 vstr(SwVfpRegister::from_code(src_code), dst); | 1143 vstr(SwVfpRegister::from_code(src_code), dst); | 
| 1143 } | 1144 } | 
| 1144 } | 1145 } | 
| 1145 | 1146 | 
| 1147 void MacroAssembler::ExtractLane(Register dst, QwNeonRegister src, | |
| 
Rodolph Perfetta (ARM)
2016/12/08 18:08:28
There is a Neon instruction for this: vmov.dt rt, …
 
bbudge
2016/12/10 21:33:04
Awesome, I've implemented this for the other data …
 | |
| 1148 NeonDataType dt, int lane) { | |
| 1149 // Read the word containing the lane into dst. | |
| 1150 int bytes_per_lane = dt & NeonDataTypeSizeMask; | |
| 1151 int byte = (lane * bytes_per_lane); | |
| 1152 int word = byte / kPointerSize; | |
| 1153 int s_code = src.code() * 4 + word; | |
| 1154 if (s_code < SwVfpRegister::kMaxNumRegisters) { | |
| 1155 vmov(dst, SwVfpRegister::from_code(s_code)); | |
| 1156 } else { | |
| 1157 VmovExtended(dst, s_code); | |
| 1158 } | |
| 1159 if (bytes_per_lane != kPointerSize) { | |
| 1160 // Extract lane, and sign extend for signed types. | |
| 1161 int width = bytes_per_lane * kBitsPerByte; | |
| 1162 int lsb = (byte & 0x3) * kBitsPerByte; | |
| 1163 if ((dt & NeonDataTypeUMask) != 0) { | |
| 1164 Ubfx(dst, dst, lsb, width); | |
| 1165 } else { | |
| 1166 Sbfx(dst, dst, lsb, width); | |
| 1167 } | |
| 1168 } | |
| 1169 } | |
| 1170 | |
| 1171 void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src, | |
| 1172 Register scratch, int lane) { | |
| 1173 int s_code = src.code() * 4 + lane; | |
| 1174 VmovExtended(dst.code(), s_code, scratch); | |
| 1175 } | |
| 1176 | |
| 1177 void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, | |
| 1178 Register src_lane, Register scratch, | |
| 1179 NeonDataType dt, int lane) { | |
| 1180 Move(dst, src); | |
| 1181 int bytes_per_lane = dt & NeonDataTypeSizeMask; | |
| 
Rodolph Perfetta (ARM)
2016/12/08 18:08:28
If you implement vmov.dt Dn[x], Rt then you can dr…
 
bbudge
2016/12/10 21:33:04
I implemented the Neon form of this instruction to …
 | |
| 1182 int byte = (lane * bytes_per_lane); | |
| 1183 int word = byte / kPointerSize; | |
| 1184 int s_code = dst.code() * 4 + word; | |
| 1185 // If lane is word sized, just move src_lane into the containing s-register. | |
| 1186 if (bytes_per_lane == kPointerSize) { | |
| 1187 if (s_code < SwVfpRegister::kMaxNumRegisters) { | |
| 1188 vmov(SwVfpRegister::from_code(s_code), src_lane); | |
| 1189 } else { | |
| 1190 VmovExtended(s_code, src_lane); | |
| 1191 } | |
| 1192 return; | |
| 1193 } | |
| 1194 // Move the s-register containing the lane to replace into scratch register. | |
| 1195 if (s_code < SwVfpRegister::kMaxNumRegisters) { | |
| 1196 vmov(scratch, SwVfpRegister::from_code(s_code)); | |
| 1197 } else { | |
| 1198 VmovExtended(scratch, s_code); | |
| 1199 } | |
| 1200 // Combine scratch with src_lane, shifted into position. | |
| 1201 int width = bytes_per_lane * kBitsPerByte; | |
| 1202 int lsb = (byte % kPointerSize) * kBitsPerByte; | |
| 1203 bfi(scratch, src_lane, lsb, width); | |
| 1204 if (s_code < SwVfpRegister::kMaxNumRegisters) { | |
| 1205 vmov(SwVfpRegister::from_code(s_code), scratch); | |
| 1206 } else { | |
| 1207 VmovExtended(s_code, scratch); | |
| 1208 } | |
| 1209 } | |
| 1210 | |
| 1211 void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src, | |
| 1212 SwVfpRegister src_lane, Register scratch, | |
| 1213 int lane) { | |
| 1214 Move(dst, src); | |
| 1215 int s_code = dst.code() * 4 + lane; | |
| 1216 VmovExtended(s_code, src_lane.code(), scratch); | |
| 1217 } | |
| 1218 | |
| 1146 void MacroAssembler::LslPair(Register dst_low, Register dst_high, | 1219 void MacroAssembler::LslPair(Register dst_low, Register dst_high, | 
| 1147 Register src_low, Register src_high, | 1220 Register src_low, Register src_high, | 
| 1148 Register scratch, Register shift) { | 1221 Register scratch, Register shift) { | 
| 1149 DCHECK(!AreAliased(dst_high, src_low)); | 1222 DCHECK(!AreAliased(dst_high, src_low)); | 
| 1150 DCHECK(!AreAliased(dst_high, shift)); | 1223 DCHECK(!AreAliased(dst_high, shift)); | 
| 1151 | 1224 | 
| 1152 Label less_than_32; | 1225 Label less_than_32; | 
| 1153 Label done; | 1226 Label done; | 
| 1154 rsb(scratch, shift, Operand(32), SetCC); | 1227 rsb(scratch, shift, Operand(32), SetCC); | 
| 1155 b(gt, &less_than_32); | 1228 b(gt, &less_than_32); | 
| (...skipping 2735 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3891 } | 3964 } | 
| 3892 } | 3965 } | 
| 3893 if (mag.shift > 0) mov(result, Operand(result, ASR, mag.shift)); | 3966 if (mag.shift > 0) mov(result, Operand(result, ASR, mag.shift)); | 
| 3894 add(result, result, Operand(dividend, LSR, 31)); | 3967 add(result, result, Operand(dividend, LSR, 31)); | 
| 3895 } | 3968 } | 
| 3896 | 3969 | 
| 3897 } // namespace internal | 3970 } // namespace internal | 
| 3898 } // namespace v8 | 3971 } // namespace v8 | 
| 3899 | 3972 | 
| 3900 #endif // V8_TARGET_ARCH_ARM | 3973 #endif // V8_TARGET_ARCH_ARM | 
| OLD | NEW |