Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Unified Diff: src/arm/assembler-arm.cc

Issue 2546933002: [Turbofan] Add ARM NEON instructions for implementing SIMD. (Closed)
Patch Set: Review comments. Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/arm/assembler-arm.cc
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index 0a671c06244aefca6d2ca8997a8c35fd583281cd..da29480af353e769f8753d6c7a188c65d959a6ab 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -483,30 +483,6 @@ void NeonMemOperand::SetAlignment(int align) {
}
}
-
-NeonListOperand::NeonListOperand(DoubleRegister base, int registers_count) {
- base_ = base;
- switch (registers_count) {
- case 1:
- type_ = nlt_1;
- break;
- case 2:
- type_ = nlt_2;
- break;
- case 3:
- type_ = nlt_3;
- break;
- case 4:
- type_ = nlt_4;
- break;
- default:
- UNREACHABLE();
- type_ = nlt_1;
- break;
- }
-}
-
-
// -----------------------------------------------------------------------------
// Specific instructions, constants, and masks.
@@ -2968,7 +2944,6 @@ void Assembler::vmov(const Register dst,
emit(cond | 0xE*B24 | B20 | sn*B16 | dst.code()*B12 | 0xA*B8 | n*B7 | B4);
}
-
// Type of data to read from or write to VFP register.
// Used as specifier in generic vcvt instruction.
enum VFPType { S32, U32, F32, F64 };
@@ -3915,6 +3890,18 @@ void Assembler::vmov(const QwNeonRegister dst, const QwNeonRegister src) {
B6 | m * B5 | B4 | vm);
}
+void Assembler::vmvn(const QwNeonRegister dst, const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ // Instruction details available in ARM DDI 0406C.b, A8-966.
+ DCHECK(VfpRegisterIsAvailable(dst));
+ DCHECK(VfpRegisterIsAvailable(src));
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ src.split_code(&vm, &m);
+ emit(0x1E7 * B23 | d * B22 | 3 * B20 | vd * B12 | 0x17 * B6 | m * B5 | vm);
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 0x1E7U according to issue 5725
bbudge 2016/12/10 21:33:03 Done.
+}
+
void Assembler::vswp(DwVfpRegister dst, DwVfpRegister src) {
// Instruction details available in ARM DDI 0406C.b, A8.8.418.
// 1111(31-28) | 00111(27-23) | D(22) | 110010(21-16) |
@@ -3940,8 +3927,105 @@ void Assembler::vswp(QwNeonRegister dst, QwNeonRegister src) {
vm);
}
+void Assembler::vdup(const QwNeonRegister dst, const Register src,
+ NeonSize size) {
+ DCHECK(IsEnabled(NEON));
+ // Instruction details available in ARM DDI 0406C.b, A8-886.
+ int B = 0, E = 0;
+ switch (size) {
+ case Neon8:
+ B = 1;
+ break;
+ case Neon16:
+ E = 1;
+ break;
+ case Neon32:
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+ int vd, d;
+ dst.split_code(&vd, &d);
+
+ emit(al | 0x1D * B23 | B * B22 | B21 | vd * B16 | src.code() * B12 |
+ 0xB * B8 | d * B7 | E * B5 | B4);
+}
+
+void Assembler::vdup(const QwNeonRegister dst, const SwVfpRegister src) {
+ DCHECK(IsEnabled(NEON));
+ // Instruction details available in ARM DDI 0406C.b, A8-884.
+ int index = src.code() & 1;
+ int d_reg = src.code() / 2;
+ int imm4 = 4 | index << 3; // esize = 32, index in bit 3.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ DwVfpRegister::from_code(d_reg).split_code(&vm, &m);
+
+ emit(0x1E7 * B23 | d * B22 | 0x3 * B20 | imm4 * B16 | vd * B12 | 0x18 * B7 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 0x1E7U
bbudge 2016/12/10 21:33:03 Done.
+ B6 | m * B5 | vm);
+}
+
+// Encode NEON vcvt.src_type.dst_type instruction.
+static Instr EncodeNeonVCVT(const VFPType dst_type, const QwNeonRegister dst,
+ const VFPType src_type, const QwNeonRegister src) {
+ DCHECK(src_type != dst_type);
+ DCHECK(src_type == F32 || dst_type == F32);
+ DCHECK(src_type != F64 && dst_type != F64);
+ // Instruction details available in ARM DDI 0406C.b, A8.8.868.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vm, m;
+ src.split_code(&vm, &m);
+
+ int op = 0;
+ if (src_type == F32) {
+ op = dst_type == U32 ? 3 : 2;
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 DCHECK((dst_type == U32) || (dst_type == S32));
bbudge 2016/12/10 21:33:03 It's (subtly) implied by the existing DCHECKs above...
+ } else {
+ DCHECK_EQ(F32, dst_type);
+ op = src_type == U32 ? 1 : 0;
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 ditto with src_type
bbudge 2016/12/10 21:33:03 Done.
+ }
+
+ return 0x1E7u * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x3 * B9 | op * B7 |
+ B6 | m * B5 | vm;
+}
+
+void Assembler::vcvt_f32_s32(const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ DCHECK(VfpRegisterIsAvailable(dst));
+ DCHECK(VfpRegisterIsAvailable(src));
+ emit(EncodeNeonVCVT(F32, dst, S32, src));
+}
+
+void Assembler::vcvt_f32_u32(const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ DCHECK(VfpRegisterIsAvailable(dst));
+ DCHECK(VfpRegisterIsAvailable(src));
+ emit(EncodeNeonVCVT(F32, dst, U32, src));
+}
+
+void Assembler::vcvt_s32_f32(const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ DCHECK(VfpRegisterIsAvailable(dst));
+ DCHECK(VfpRegisterIsAvailable(src));
+ emit(EncodeNeonVCVT(S32, dst, F32, src));
+}
+
+void Assembler::vcvt_u32_f32(const QwNeonRegister dst,
+ const QwNeonRegister src) {
+ DCHECK(IsEnabled(NEON));
+ DCHECK(VfpRegisterIsAvailable(dst));
+ DCHECK(VfpRegisterIsAvailable(src));
+ emit(EncodeNeonVCVT(U32, dst, F32, src));
+}
+
void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
+ // Dd = veor(Dn, Dm) 64 bit integer exclusive OR.
// Instruction details available in ARM DDI 0406C.b, A8.8.888.
DCHECK(IsEnabled(NEON));
int vd, d;
@@ -3956,6 +4040,7 @@ void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
void Assembler::veor(QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
+ // Qd = veor(Qn, Qm) SIMD integer exclusive OR.
// Instruction details available in ARM DDI 0406C.b, A8.8.888.
DCHECK(IsEnabled(NEON));
int vd, d;
@@ -3968,6 +4053,148 @@ void Assembler::veor(QwNeonRegister dst, QwNeonRegister src1,
B4 | vm);
}
+void Assembler::vadd(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vadd(Qn, Qm) SIMD floating point addition.
+ // Instruction details available in ARM DDI 0406C.b, A8-830.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ emit(0x1E4 * B23 | d * B22 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 | B6 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 0x1E4U
bbudge 2016/12/10 21:33:03 Done.
+ m * B5 | vm);
+}
+
+void Assembler::vadd(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2, NeonSize size) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vadd(Qn, Qm) SIMD integer addition.
+ // Instruction details available in ARM DDI 0406C.b, A8-828.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ emit(0x1E4 * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 ditto
bbudge 2016/12/10 21:33:03 Done.
+ n * B7 | B6 | m * B5 | vm);
+}
+
+void Assembler::vsub(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vsub(Qn, Qm) SIMD floating point subtraction.
+ // Instruction details available in ARM DDI 0406C.b, A8-1086.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ emit(0x1E4 * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xD * B8 | n * B7 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 ditto
bbudge 2016/12/10 21:33:03 Done.
+ B6 | m * B5 | vm);
+}
+
+void Assembler::vsub(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2, NeonSize size) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vsub(Qn, Qm) SIMD integer subtraction.
+ // Instruction details available in ARM DDI 0406C.b, A8-1084.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ emit(0x1E6 * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 ditto
bbudge 2016/12/10 21:33:03 Done.
+ n * B7 | B6 | m * B5 | vm);
+}
+
+void Assembler::vtst(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2, NeonSize size) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vtst(Qn, Qm) SIMD test integer operands.
+ // Instruction details available in ARM DDI 0406C.b, A8-1098.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ emit(0x1E4 * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 ditto
bbudge 2016/12/10 21:33:03 Done.
+ n * B7 | B6 | m * B5 | B4 | vm);
+}
+
+void Assembler::vceq(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2, NeonSize size) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vceq(Qn, Qm) SIMD integer compare equal.
+ // Instruction details available in ARM DDI 0406C.b, A8-844.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ int sz = static_cast<int>(size);
+ emit(0x1E6 * B23 | d * B22 | sz * B20 | vn * B16 | vd * B12 | 0x8 * B8 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 ditto
bbudge 2016/12/10 21:33:04 Done.
+ n * B7 | B6 | m * B5 | B4 | vm);
+}
+
+void Assembler::vbsl(QwNeonRegister dst, const QwNeonRegister src1,
+ const QwNeonRegister src2) {
+ DCHECK(IsEnabled(NEON));
+ // Qd = vbsl(Qn, Qm) SIMD bitwise select.
+ // Instruction details available in ARM DDI 0406C.b, A8-844.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ src1.split_code(&vn, &n);
+ int vm, m;
+ src2.split_code(&vm, &m);
+ int op = 1; // vbsl
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 is that necessary? why not using B20 directly below...
bbudge 2016/12/10 21:33:04 Done.
+ emit(0x1E6 * B23 | d * B22 | op * B20 | vn * B16 | vd * B12 | 0x1 * B8 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 0x1E6U
bbudge 2016/12/10 21:33:03 Done.
+ n * B7 | B6 | m * B5 | B4 | vm);
+}
+
+void Assembler::vtbl(const DwVfpRegister dst, const NeonListOperand& list,
+ const DwVfpRegister index) {
+ DCHECK(IsEnabled(NEON));
+ // Dd = vtbl(table, Dm) SIMD vector permute, zero at out of range indices.
+ // Instruction details available in ARM DDI 0406C.b, A8-1094.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ list.base().split_code(&vn, &n);
+ int vm, m;
+ index.split_code(&vm, &m);
+ int op = 1; // vbsl
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 unused.
bbudge 2016/12/10 21:33:04 Done.
+ emit(0x1E7 * B23 | d * B22 | 0x3 * B20 | vn * B16 | vd * B12 | 0x2 * B10 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 0x1E7U
bbudge 2016/12/10 21:33:03 Done.
+ list.len() * B8 | n * B7 | m * B5 | vm);
+}
+
+void Assembler::vtbx(const DwVfpRegister dst, const NeonListOperand& list,
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 vtbl and vtbx encoding only differ by the value of...
bbudge 2016/12/10 21:33:03 Yes, Done.
+ const DwVfpRegister index) {
+ DCHECK(IsEnabled(NEON));
+ // Dd = vtbx(table, Dm) SIMD vector permute, skip out of range indices.
+ // Instruction details available in ARM DDI 0406C.b, A8-1094.
+ int vd, d;
+ dst.split_code(&vd, &d);
+ int vn, n;
+ list.base().split_code(&vn, &n);
+ int vm, m;
+ index.split_code(&vm, &m);
+ int op = 1; // vbsl
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 unused
bbudge 2016/12/10 21:33:03 Done.
+ emit(0x1E7 * B23 | d * B22 | 0x3 * B20 | vn * B16 | vd * B12 | 0x2 * B10 |
Rodolph Perfetta (ARM) 2016/12/08 18:08:27 0x1E7U
bbudge 2016/12/10 21:33:04 Done.
+ list.len() * B8 | n * B7 | B6 | m * B5 | vm);
+}
+
// Pseudo instructions.
void Assembler::nop(int type) {
// ARMv6{K/T2} and v7 have an actual NOP instruction but it serializes

Powered by Google App Engine
This is Rietveld 408576698