Index: src/arm/simulator-arm.cc |
diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc |
index a2f3536ad3d4b0114e97b3807dfdf0e44787507f..2bf2a6a3a52b549a548bc384081e975344b63a69 100644 |
--- a/src/arm/simulator-arm.cc |
+++ b/src/arm/simulator-arm.cc |
@@ -896,20 +896,28 @@ |
memcpy(vfp_registers_ + dreg * 2, value, sizeof(*value) * 2); |
} |
-template <typename T, int SIZE> |
-void Simulator::get_neon_register(int reg, T (&value)[SIZE / sizeof(T)]) { |
- DCHECK(SIZE == kSimd128Size || SIZE == kDoubleSize); |
- DCHECK_LE(0, reg); |
- DCHECK_GT(SIZE == kSimd128Size ? num_q_registers : num_d_registers, reg); |
- memcpy(value, vfp_registers_ + reg * (SIZE / 4), SIZE); |
-} |
- |
-template <typename T, int SIZE> |
-void Simulator::set_neon_register(int reg, const T (&value)[SIZE / sizeof(T)]) { |
- DCHECK(SIZE == kSimd128Size || SIZE == kDoubleSize); |
- DCHECK_LE(0, reg); |
- DCHECK_GT(SIZE == kSimd128Size ? num_q_registers : num_d_registers, reg); |
- memcpy(vfp_registers_ + reg * (SIZE / 4), value, SIZE); |
+template <typename T> |
+void Simulator::get_d_register(int dreg, T* value) { |
+ DCHECK((dreg >= 0) && (dreg < num_d_registers)); |
+ memcpy(value, vfp_registers_ + dreg * 2, kDoubleSize); |
+} |
+ |
+template <typename T> |
+void Simulator::set_d_register(int dreg, const T* value) { |
+ DCHECK((dreg >= 0) && (dreg < num_d_registers)); |
+ memcpy(vfp_registers_ + dreg * 2, value, kDoubleSize); |
+} |
+ |
+template <typename T> |
+void Simulator::get_q_register(int qreg, T* value) { |
+ DCHECK((qreg >= 0) && (qreg < num_q_registers)); |
+ memcpy(value, vfp_registers_ + qreg * 4, kSimd128Size); |
+} |
+ |
+template <typename T> |
+void Simulator::set_q_register(int qreg, const T* value) { |
+ DCHECK((qreg >= 0) && (qreg < num_q_registers)); |
+ memcpy(vfp_registers_ + qreg * 4, value, kSimd128Size); |
} |
// Raw access to the PC register. |
@@ -3500,7 +3508,7 @@ |
UNREACHABLE(); |
break; |
} |
- set_neon_register(vd, q_data); |
+ set_q_register(vd, q_data); |
} |
} else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) { |
// vmov (scalar to ARM core register) |
@@ -3987,6 +3995,7 @@ |
} |
// Templated operations for NEON instructions. |
+// TODO(bbudge): Add more templates for use in DecodeSpecialCondition. |
template <typename T, typename U> |
U Widen(T value) { |
static_assert(sizeof(int64_t) > sizeof(T), "T must be int32_t or smaller"); |
@@ -4016,38 +4025,21 @@ |
return static_cast<T>(clamped); |
} |
+template <typename T> |
+T MinMax(T a, T b, bool is_min) { |
+ return is_min ? std::min(a, b) : std::max(a, b); |
+} |
+ |
template <typename T, typename U> |
void Widen(Simulator* simulator, int Vd, int Vm) { |
static const int kLanes = 8 / sizeof(T); |
T src[kLanes]; |
U dst[kLanes]; |
- simulator->get_neon_register<T, kDoubleSize>(Vm, src); |
+ simulator->get_d_register(Vm, src); |
for (int i = 0; i < kLanes; i++) { |
dst[i] = Widen<T, U>(src[i]); |
} |
- simulator->set_neon_register(Vd, dst); |
-} |
- |
-template <typename T, int SIZE> |
-void Abs(Simulator* simulator, int Vd, int Vm) { |
- static const int kElems = SIZE / sizeof(T); |
- T src[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vm, src); |
- for (int i = 0; i < kElems; i++) { |
- src[i] = std::abs(src[i]); |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src); |
-} |
- |
-template <typename T, int SIZE> |
-void Neg(Simulator* simulator, int Vd, int Vm) { |
- static const int kElems = SIZE / sizeof(T); |
- T src[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vm, src); |
- for (int i = 0; i < kElems; i++) { |
- src[i] = -src[i]; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src); |
+ simulator->set_q_register(Vd, dst); |
} |
template <typename T, typename U> |
@@ -4055,222 +4047,35 @@ |
static const int kLanes = 16 / sizeof(T); |
T src[kLanes]; |
U dst[kLanes]; |
- simulator->get_neon_register(Vm, src); |
+ simulator->get_q_register(Vm, src); |
for (int i = 0; i < kLanes; i++) { |
dst[i] = Narrow<T, U>(Clamp<U>(src[i])); |
} |
- simulator->set_neon_register<U, kDoubleSize>(Vd, dst); |
+ simulator->set_d_register(Vd, dst); |
} |
template <typename T> |
void AddSaturate(Simulator* simulator, int Vd, int Vm, int Vn) { |
static const int kLanes = 16 / sizeof(T); |
T src1[kLanes], src2[kLanes]; |
- simulator->get_neon_register(Vn, src1); |
- simulator->get_neon_register(Vm, src2); |
+ simulator->get_q_register(Vn, src1); |
+ simulator->get_q_register(Vm, src2); |
for (int i = 0; i < kLanes; i++) { |
src1[i] = Clamp<T>(Widen<T, int64_t>(src1[i]) + Widen<T, int64_t>(src2[i])); |
} |
- simulator->set_neon_register(Vd, src1); |
+ simulator->set_q_register(Vd, src1); |
} |
template <typename T> |
void SubSaturate(Simulator* simulator, int Vd, int Vm, int Vn) { |
static const int kLanes = 16 / sizeof(T); |
T src1[kLanes], src2[kLanes]; |
- simulator->get_neon_register(Vn, src1); |
- simulator->get_neon_register(Vm, src2); |
+ simulator->get_q_register(Vn, src1); |
+ simulator->get_q_register(Vm, src2); |
for (int i = 0; i < kLanes; i++) { |
src1[i] = Clamp<T>(Widen<T, int64_t>(src1[i]) - Widen<T, int64_t>(src2[i])); |
} |
- simulator->set_neon_register(Vd, src1); |
-} |
- |
-template <typename T, int SIZE> |
-void Zip(Simulator* simulator, int Vd, int Vm) { |
- static const int kElems = SIZE / sizeof(T); |
- static const int kPairs = kElems / 2; |
- T src1[kElems], src2[kElems], dst1[kElems], dst2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vd, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kPairs; i++) { |
- dst1[i * 2] = src1[i]; |
- dst1[i * 2 + 1] = src2[i]; |
- dst2[i * 2] = src1[i + kPairs]; |
- dst2[i * 2 + 1] = src2[i + kPairs]; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, dst1); |
- simulator->set_neon_register<T, SIZE>(Vm, dst2); |
-} |
- |
-template <typename T, int SIZE> |
-void Unzip(Simulator* simulator, int Vd, int Vm) { |
- static const int kElems = SIZE / sizeof(T); |
- static const int kPairs = kElems / 2; |
- T src1[kElems], src2[kElems], dst1[kElems], dst2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vd, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kPairs; i++) { |
- dst1[i] = src1[i * 2]; |
- dst1[i + kPairs] = src2[i * 2]; |
- dst2[i] = src1[i * 2 + 1]; |
- dst2[i + kPairs] = src2[i * 2 + 1]; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, dst1); |
- simulator->set_neon_register<T, SIZE>(Vm, dst2); |
-} |
- |
-template <typename T, int SIZE> |
-void Transpose(Simulator* simulator, int Vd, int Vm) { |
- static const int kElems = SIZE / sizeof(T); |
- static const int kPairs = kElems / 2; |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vd, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kPairs; i++) { |
- std::swap(src1[2 * i + 1], src2[2 * i]); |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
- simulator->set_neon_register<T, SIZE>(Vm, src2); |
-} |
- |
-template <typename T, int SIZE> |
-void Test(Simulator* simulator, int Vd, int Vm, int Vn) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- src1[i] = (src1[i] & src2[i]) != 0 ? -1 : 0; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T, int SIZE> |
-void Add(Simulator* simulator, int Vd, int Vm, int Vn) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- src1[i] += src2[i]; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T, int SIZE> |
-void Sub(Simulator* simulator, int Vd, int Vm, int Vn) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- src1[i] -= src2[i]; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T, int SIZE> |
-void Mul(Simulator* simulator, int Vd, int Vm, int Vn) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- src1[i] *= src2[i]; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T, int SIZE> |
-void ShiftLeft(Simulator* simulator, int Vd, int Vm, int shift) { |
- static const int kElems = SIZE / sizeof(T); |
- T src[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vm, src); |
- for (int i = 0; i < kElems; i++) { |
- src[i] <<= shift; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src); |
-} |
- |
-template <typename T, int SIZE> |
-void ShiftRight(Simulator* simulator, int Vd, int Vm, int shift) { |
- static const int kElems = SIZE / sizeof(T); |
- T src[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vm, src); |
- for (int i = 0; i < kElems; i++) { |
- src[i] >>= shift; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src); |
-} |
- |
-template <typename T, int SIZE> |
-void ArithmeticShiftRight(Simulator* simulator, int Vd, int Vm, int shift) { |
- static const int kElems = SIZE / sizeof(T); |
- T src[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vm, src); |
- for (int i = 0; i < kElems; i++) { |
- src[i] = ArithmeticShiftRight(src[i], shift); |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src); |
-} |
- |
-template <typename T, int SIZE> |
-void CompareEqual(Simulator* simulator, int Vd, int Vm, int Vn) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- src1[i] = src1[i] == src2[i] ? -1 : 0; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T, int SIZE> |
-void CompareGreater(Simulator* simulator, int Vd, int Vm, int Vn, bool ge) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- if (ge) |
- src1[i] = src1[i] >= src2[i] ? -1 : 0; |
- else |
- src1[i] = src1[i] > src2[i] ? -1 : 0; |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T> |
-T MinMax(T a, T b, bool is_min) { |
- return is_min ? std::min(a, b) : std::max(a, b); |
-} |
- |
-template <typename T, int SIZE> |
-void MinMax(Simulator* simulator, int Vd, int Vm, int Vn, bool min) { |
- static const int kElems = SIZE / sizeof(T); |
- T src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, SIZE>(Vn, src1); |
- simulator->get_neon_register<T, SIZE>(Vm, src2); |
- for (int i = 0; i < kElems; i++) { |
- src1[i] = MinMax(src1[i], src2[i], min); |
- } |
- simulator->set_neon_register<T, SIZE>(Vd, src1); |
-} |
- |
-template <typename T> |
-void PairwiseMinMax(Simulator* simulator, int Vd, int Vm, int Vn, bool min) { |
- static const int kElems = kDoubleSize / sizeof(T); |
- static const int kPairs = kElems / 2; |
- T dst[kElems], src1[kElems], src2[kElems]; |
- simulator->get_neon_register<T, kDoubleSize>(Vn, src1); |
- simulator->get_neon_register<T, kDoubleSize>(Vm, src2); |
- for (int i = 0; i < kPairs; i++) { |
- dst[i] = MinMax(src1[i * 2], src1[i * 2 + 1], min); |
- dst[i + kPairs] = MinMax(src2[i * 2], src2[i * 2 + 1], min); |
- } |
- simulator->set_neon_register<T, kDoubleSize>(Vd, dst); |
+ simulator->set_q_register(Vd, src1); |
} |
void Simulator::DecodeSpecialCondition(Instruction* instr) { |
@@ -4316,25 +4121,25 @@ |
// vmov Qd, Qm. |
// vorr, Qd, Qm, Qn. |
uint32_t src1[4]; |
- get_neon_register(Vm, src1); |
+ get_q_register(Vm, src1); |
if (Vm != Vn) { |
uint32_t src2[4]; |
- get_neon_register(Vn, src2); |
+ get_q_register(Vn, src2); |
for (int i = 0; i < 4; i++) { |
src1[i] = src1[i] | src2[i]; |
} |
} |
- set_neon_register(Vd, src1); |
+ set_q_register(Vd, src1); |
} else if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 1 && |
instr->Bit(4) == 1) { |
// vand Qd, Qm, Qn. |
uint32_t src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
for (int i = 0; i < 4; i++) { |
src1[i] = src1[i] & src2[i]; |
} |
- set_neon_register(Vd, src1); |
+ set_q_register(Vd, src1); |
} else { |
UNIMPLEMENTED(); |
} |
@@ -4368,15 +4173,45 @@ |
bool ge = instr->Bit(4) == 1; |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
switch (size) { |
- case Neon8: |
- CompareGreater<int8_t, kSimd128Size>(this, Vd, Vm, Vn, ge); |
+ case Neon8: { |
+ int8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ if (ge) |
+ src1[i] = src1[i] >= src2[i] ? 0xFF : 0; |
+ else |
+ src1[i] = src1[i] > src2[i] ? 0xFF : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- CompareGreater<int16_t, kSimd128Size>(this, Vd, Vm, Vn, ge); |
+ } |
+ case Neon16: { |
+ int16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ if (ge) |
+ src1[i] = src1[i] >= src2[i] ? 0xFFFF : 0; |
+ else |
+ src1[i] = src1[i] > src2[i] ? 0xFFFF : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- CompareGreater<int32_t, kSimd128Size>(this, Vd, Vm, Vn, ge); |
+ } |
+ case Neon32: { |
+ int32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ if (ge) |
+ src1[i] = src1[i] >= src2[i] ? 0xFFFFFFFF : 0; |
+ else |
+ src1[i] = src1[i] > src2[i] ? 0xFFFFFFFF : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4388,15 +4223,36 @@ |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
bool min = instr->Bit(4) != 0; |
switch (size) { |
- case Neon8: |
- MinMax<int8_t, kSimd128Size>(this, Vd, Vm, Vn, min); |
+ case Neon8: { |
+ int8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] = MinMax(src1[i], src2[i], min); |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- MinMax<int16_t, kSimd128Size>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon16: { |
+ int16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] = MinMax(src1[i], src2[i], min); |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- MinMax<int32_t, kSimd128Size>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon32: { |
+ int32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] = MinMax(src1[i], src2[i], min); |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4409,15 +4265,36 @@ |
if (instr->Bit(4) == 0) { |
// vadd.i<size> Qd, Qm, Qn. |
switch (size) { |
- case Neon8: |
- Add<uint8_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] += src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- Add<uint16_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] += src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- Add<uint32_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] += src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4425,15 +4302,36 @@ |
} else { |
// vtst.i<size> Qd, Qm, Qn. |
switch (size) { |
- case Neon8: |
- Test<uint8_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- Test<uint16_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- Test<uint32_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFFFFFFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4446,15 +4344,36 @@ |
// vmul.i<size> Qd, Qm, Qn. |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
switch (size) { |
- case Neon8: |
- Mul<uint8_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] *= src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- Mul<uint16_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] *= src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- Mul<uint32_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] *= src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4469,15 +4388,37 @@ |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
bool min = instr->Bit(4) != 0; |
switch (size) { |
- case Neon8: |
- PairwiseMinMax<int8_t>(this, Vd, Vm, Vn, min); |
+ case Neon8: { |
+ int8_t dst[8], src1[8], src2[8]; |
+ get_d_register(Vn, src1); |
+ get_d_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min); |
+ dst[i + 4] = MinMax(src2[i * 2], src2[i * 2 + 1], min); |
+ } |
+ set_d_register(Vd, dst); |
break; |
- case Neon16: |
- PairwiseMinMax<int16_t>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon16: { |
+ int16_t dst[4], src1[4], src2[4]; |
+ get_d_register(Vn, src1); |
+ get_d_register(Vm, src2); |
+ for (int i = 0; i < 2; i++) { |
+ dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min); |
+ dst[i + 2] = MinMax(src2[i * 2], src2[i * 2 + 1], min); |
+ } |
+ set_d_register(Vd, dst); |
break; |
- case Neon32: |
- PairwiseMinMax<int32_t>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon32: { |
+ int32_t dst[2], src1[2], src2[2]; |
+ get_d_register(Vn, src1); |
+ get_d_register(Vm, src2); |
+ dst[0] = MinMax(src1[0], src1[1], min); |
+ dst[1] = MinMax(src2[0], src2[1], min); |
+ set_d_register(Vd, dst); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4487,8 +4428,8 @@ |
case 0xd: { |
if (instr->Bit(4) == 0) { |
float src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
for (int i = 0; i < 4; i++) { |
if (instr->Bit(21) == 0) { |
// vadd.f32 Qd, Qm, Qn. |
@@ -4498,7 +4439,7 @@ |
src1[i] = src1[i] - src2[i]; |
} |
} |
- set_neon_register(Vd, src1); |
+ set_q_register(Vd, src1); |
} else { |
UNIMPLEMENTED(); |
} |
@@ -4508,13 +4449,13 @@ |
if (instr->Bits(21, 20) == 0 && instr->Bit(4) == 0) { |
// vceq.f32. |
float src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
uint32_t dst[4]; |
for (int i = 0; i < 4; i++) { |
dst[i] = (src1[i] == src2[i]) ? 0xFFFFFFFF : 0; |
} |
- set_neon_register(Vd, dst); |
+ set_q_register(Vd, dst); |
} else { |
UNIMPLEMENTED(); |
} |
@@ -4523,8 +4464,8 @@ |
case 0xf: { |
if (instr->Bit(20) == 0 && instr->Bit(6) == 1) { |
float src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
if (instr->Bit(4) == 1) { |
if (instr->Bit(21) == 0) { |
// vrecps.f32 Qd, Qm, Qn. |
@@ -4544,7 +4485,7 @@ |
src1[i] = MinMax(src1[i], src2[i], min); |
} |
} |
- set_neon_register(Vd, src1); |
+ set_q_register(Vd, src1); |
} else { |
UNIMPLEMENTED(); |
} |
@@ -4585,8 +4526,8 @@ |
int Vm = instr->VFPMRegValue(kSimd128Precision); |
int Vn = instr->VFPNRegValue(kSimd128Precision); |
uint8_t src1[16], src2[16], dst[16]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
int boundary = kSimd128Size - imm4; |
int i = 0; |
for (; i < boundary; i++) { |
@@ -4595,7 +4536,7 @@ |
for (; i < 16; i++) { |
dst[i] = src2[i - boundary]; |
} |
- set_neon_register(Vd, dst); |
+ set_q_register(Vd, dst); |
} else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) { |
// vshl.i<size> Qd, Qm, shift |
int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16)); |
@@ -4604,15 +4545,33 @@ |
int Vm = instr->VFPMRegValue(kSimd128Precision); |
NeonSize ns = static_cast<NeonSize>(size / 16); |
switch (ns) { |
- case Neon8: |
- ShiftLeft<uint8_t, kSimd128Size>(this, Vd, Vm, shift); |
+ case Neon8: { |
+ uint8_t src[16]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 16; i++) { |
+ src[i] <<= shift; |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon16: |
- ShiftLeft<uint16_t, kSimd128Size>(this, Vd, Vm, shift); |
+ } |
+ case Neon16: { |
+ uint16_t src[8]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 8; i++) { |
+ src[i] <<= shift; |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon32: |
- ShiftLeft<uint32_t, kSimd128Size>(this, Vd, Vm, shift); |
+ } |
+ case Neon32: { |
+ uint32_t src[4]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 4; i++) { |
+ src[i] <<= shift; |
+ } |
+ set_q_register(Vd, src); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4625,15 +4584,33 @@ |
int Vm = instr->VFPMRegValue(kSimd128Precision); |
NeonSize ns = static_cast<NeonSize>(size / 16); |
switch (ns) { |
- case Neon8: |
- ArithmeticShiftRight<int8_t, kSimd128Size>(this, Vd, Vm, shift); |
+ case Neon8: { |
+ int8_t src[16]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 16; i++) { |
+ src[i] = ArithmeticShiftRight(src[i], shift); |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon16: |
- ArithmeticShiftRight<int16_t, kSimd128Size>(this, Vd, Vm, shift); |
+ } |
+ case Neon16: { |
+ int16_t src[8]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 8; i++) { |
+ src[i] = ArithmeticShiftRight(src[i], shift); |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon32: |
- ArithmeticShiftRight<int32_t, kSimd128Size>(this, Vd, Vm, shift); |
+ } |
+ case Neon32: { |
+ int32_t src[4]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 4; i++) { |
+ src[i] = ArithmeticShiftRight(src[i], shift); |
+ } |
+ set_q_register(Vd, src); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4681,13 +4658,13 @@ |
if (instr->Bits(21, 20) == 1 && instr->Bit(4) == 1) { |
// vbsl.size Qd, Qm, Qn. |
uint32_t dst[4], src1[4], src2[4]; |
- get_neon_register(Vd, dst); |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vd, dst); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
for (int i = 0; i < 4; i++) { |
dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]); |
} |
- set_neon_register(Vd, dst); |
+ set_q_register(Vd, dst); |
} else if (instr->Bits(21, 20) == 0 && instr->Bit(4) == 1) { |
if (instr->Bit(6) == 0) { |
// veor Dd, Dn, Dm |
@@ -4700,10 +4677,10 @@ |
} else { |
// veor Qd, Qn, Qm |
uint32_t src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
for (int i = 0; i < 4; i++) src1[i] ^= src2[i]; |
- set_neon_register(Vd, src1); |
+ set_q_register(Vd, src1); |
} |
} else { |
UNIMPLEMENTED(); |
@@ -4738,15 +4715,45 @@ |
bool ge = instr->Bit(4) == 1; |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
switch (size) { |
- case Neon8: |
- CompareGreater<uint8_t, kSimd128Size>(this, Vd, Vm, Vn, ge); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ if (ge) |
+ src1[i] = src1[i] >= src2[i] ? 0xFFu : 0; |
+ else |
+ src1[i] = src1[i] > src2[i] ? 0xFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- CompareGreater<uint16_t, kSimd128Size>(this, Vd, Vm, Vn, ge); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ if (ge) |
+ src1[i] = src1[i] >= src2[i] ? 0xFFFFu : 0; |
+ else |
+ src1[i] = src1[i] > src2[i] ? 0xFFFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- CompareGreater<uint32_t, kSimd128Size>(this, Vd, Vm, Vn, ge); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ if (ge) |
+ src1[i] = src1[i] >= src2[i] ? 0xFFFFFFFFu : 0; |
+ else |
+ src1[i] = src1[i] > src2[i] ? 0xFFFFFFFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4758,15 +4765,36 @@ |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
bool min = instr->Bit(4) != 0; |
switch (size) { |
- case Neon8: |
- MinMax<uint8_t, kSimd128Size>(this, Vd, Vm, Vn, min); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] = MinMax(src1[i], src2[i], min); |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- MinMax<uint16_t, kSimd128Size>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] = MinMax(src1[i], src2[i], min); |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- MinMax<uint32_t, kSimd128Size>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] = MinMax(src1[i], src2[i], min); |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4778,15 +4806,36 @@ |
// vsub.size Qd, Qm, Qn. |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
switch (size) { |
- case Neon8: |
- Sub<uint8_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] -= src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- Sub<uint16_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] -= src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- Sub<uint32_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] -= src2[i]; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4795,15 +4844,36 @@ |
// vceq.size Qd, Qm, Qn. |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
switch (size) { |
- case Neon8: |
- CompareEqual<uint8_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ case Neon8: { |
+ uint8_t src1[16], src2[16]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 16; i++) { |
+ src1[i] = (src1[i] == src2[i]) ? 0xFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon16: |
- CompareEqual<uint16_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ src1[i] = (src1[i] == src2[i]) ? 0xFFFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
- case Neon32: |
- CompareEqual<uint32_t, kSimd128Size>(this, Vd, Vm, Vn); |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4]; |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ src1[i] = (src1[i] == src2[i]) ? 0xFFFFFFFFu : 0; |
+ } |
+ set_q_register(Vd, src1); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4816,15 +4886,37 @@ |
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20)); |
bool min = instr->Bit(4) != 0; |
switch (size) { |
- case Neon8: |
- PairwiseMinMax<uint8_t>(this, Vd, Vm, Vn, min); |
+ case Neon8: { |
+ uint8_t dst[8], src1[8], src2[8]; |
+ get_d_register(Vn, src1); |
+ get_d_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min); |
+ dst[i + 4] = MinMax(src2[i * 2], src2[i * 2 + 1], min); |
+ } |
+ set_d_register(Vd, dst); |
break; |
- case Neon16: |
- PairwiseMinMax<uint16_t>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon16: { |
+ uint16_t dst[4], src1[4], src2[4]; |
+ get_d_register(Vn, src1); |
+ get_d_register(Vm, src2); |
+ for (int i = 0; i < 2; i++) { |
+ dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min); |
+ dst[i + 2] = MinMax(src2[i * 2], src2[i * 2 + 1], min); |
+ } |
+ set_d_register(Vd, dst); |
break; |
- case Neon32: |
- PairwiseMinMax<uint32_t>(this, Vd, Vm, Vn, min); |
+ } |
+ case Neon32: { |
+ uint32_t dst[2], src1[2], src2[2]; |
+ get_d_register(Vn, src1); |
+ get_d_register(Vm, src2); |
+ dst[0] = MinMax(src1[0], src1[1], min); |
+ dst[1] = MinMax(src2[0], src2[1], min); |
+ set_d_register(Vd, dst); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |
@@ -4835,12 +4927,12 @@ |
if (instr->Bit(21) == 0 && instr->Bit(6) == 1 && instr->Bit(4) == 1) { |
// vmul.f32 Qd, Qn, Qm |
float src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
for (int i = 0; i < 4; i++) { |
src1[i] = src1[i] * src2[i]; |
} |
- set_neon_register(Vd, src1); |
+ set_q_register(Vd, src1); |
} else { |
UNIMPLEMENTED(); |
} |
@@ -4851,8 +4943,8 @@ |
// vcge/vcgt.f32 Qd, Qm, Qn |
bool ge = instr->Bit(21) == 0; |
float src1[4], src2[4]; |
- get_neon_register(Vn, src1); |
- get_neon_register(Vm, src2); |
+ get_q_register(Vn, src1); |
+ get_q_register(Vm, src2); |
uint32_t dst[4]; |
for (int i = 0; i < 4; i++) { |
if (ge) { |
@@ -4861,7 +4953,7 @@ |
dst[i] = src1[i] > src2[i] ? 0xFFFFFFFFu : 0; |
} |
} |
- set_neon_register(Vd, dst); |
+ set_q_register(Vd, dst); |
} else { |
UNIMPLEMENTED(); |
} |
@@ -4902,7 +4994,7 @@ |
int Vd = instr->VFPDRegValue(kSimd128Precision); |
int Vm = instr->VFPMRegValue(kSimd128Precision); |
uint32_t q_data[4]; |
- get_neon_register(Vm, q_data); |
+ get_q_register(Vm, q_data); |
int op = instr->Bits(8, 7); |
for (int i = 0; i < 4; i++) { |
switch (op) { |
@@ -4928,7 +5020,7 @@ |
break; |
} |
} |
- set_neon_register(Vd, q_data); |
+ set_q_register(Vd, q_data); |
} else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) { |
if (instr->Bit(6) == 0) { |
// vswp Dd, Dm. |
@@ -4944,10 +5036,10 @@ |
uint32_t dval[4], mval[4]; |
int vd = instr->VFPDRegValue(kSimd128Precision); |
int vm = instr->VFPMRegValue(kSimd128Precision); |
- get_neon_register(vd, dval); |
- get_neon_register(vm, mval); |
- set_neon_register(vm, dval); |
- set_neon_register(vd, mval); |
+ get_q_register(vd, dval); |
+ get_q_register(vm, mval); |
+ set_q_register(vm, dval); |
+ set_q_register(vd, mval); |
} |
} else if (instr->Bits(11, 7) == 0x18) { |
// vdup.32 Qd, Sm. |
@@ -4957,15 +5049,15 @@ |
uint32_t s_data = get_s_register(vm * 2 + index); |
uint32_t q_data[4]; |
for (int i = 0; i < 4; i++) q_data[i] = s_data; |
- set_neon_register(vd, q_data); |
+ set_q_register(vd, q_data); |
} else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) { |
// vmvn Qd, Qm. |
int vd = instr->VFPDRegValue(kSimd128Precision); |
int vm = instr->VFPMRegValue(kSimd128Precision); |
uint32_t q_data[4]; |
- get_neon_register(vm, q_data); |
+ get_q_register(vm, q_data); |
for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i]; |
- set_neon_register(vd, q_data); |
+ set_q_register(vd, q_data); |
} else if (instr->Bits(11, 10) == 0x2) { |
// vtb[l,x] Dd, <list>, Dm. |
int vd = instr->VFPDRegValue(kDoublePrecision); |
@@ -4990,79 +5082,108 @@ |
} |
} |
set_d_register(vd, &result); |
- } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x1) { |
+ } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x1 && |
+ instr->Bit(6) == 1) { |
NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18)); |
- if (instr->Bit(6) == 0) { |
- int Vd = instr->VFPDRegValue(kDoublePrecision); |
- int Vm = instr->VFPMRegValue(kDoublePrecision); |
- if (instr->Bit(7) == 1) { |
- // vzip.<size> Dd, Dm. |
- switch (size) { |
- case Neon8: |
- Zip<uint8_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- case Neon16: |
- Zip<uint16_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- case Neon32: |
- Zip<uint32_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
- } |
- } else { |
- // vuzp.<size> Dd, Dm. |
- switch (size) { |
- case Neon8: |
- Unzip<uint8_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- case Neon16: |
- Unzip<uint16_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- case Neon32: |
- Unzip<uint32_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
- } |
+ int Vd = instr->VFPDRegValue(kSimd128Precision); |
+ int Vm = instr->VFPMRegValue(kSimd128Precision); |
+ if (instr->Bit(7) == 1) { |
+ // vzip.<size> Qd, Qm. |
+ switch (size) { |
+ case Neon8: { |
+ uint8_t src1[16], src2[16], dst1[16], dst2[16]; |
+ get_q_register(Vd, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ dst1[i * 2] = src1[i]; |
+ dst1[i * 2 + 1] = src2[i]; |
+ dst2[i * 2] = src1[i + 8]; |
+ dst2[i * 2 + 1] = src2[i + 8]; |
+ } |
+ set_q_register(Vd, dst1); |
+ set_q_register(Vm, dst2); |
+ break; |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8], dst1[8], dst2[8]; |
+ get_q_register(Vd, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ dst1[i * 2] = src1[i]; |
+ dst1[i * 2 + 1] = src2[i]; |
+ dst2[i * 2] = src1[i + 4]; |
+ dst2[i * 2 + 1] = src2[i + 4]; |
+ } |
+ set_q_register(Vd, dst1); |
+ set_q_register(Vm, dst2); |
+ break; |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4], dst1[4], dst2[4]; |
+ get_q_register(Vd, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 2; i++) { |
+ dst1[i * 2] = src1[i]; |
+ dst1[i * 2 + 1] = src2[i]; |
+ dst2[i * 2] = src1[i + 2]; |
+ dst2[i * 2 + 1] = src2[i + 2]; |
+ } |
+ set_q_register(Vd, dst1); |
+ set_q_register(Vm, dst2); |
+ break; |
+ } |
+ default: |
+ UNREACHABLE(); |
+ break; |
} |
} else { |
- int Vd = instr->VFPDRegValue(kSimd128Precision); |
- int Vm = instr->VFPMRegValue(kSimd128Precision); |
- if (instr->Bit(7) == 1) { |
- // vzip.<size> Qd, Qm. |
- switch (size) { |
- case Neon8: |
- Zip<uint8_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- case Neon16: |
- Zip<uint16_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- case Neon32: |
- Zip<uint32_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
- } |
- } else { |
- // vuzp.<size> Qd, Qm. |
- switch (size) { |
- case Neon8: |
- Unzip<uint8_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- case Neon16: |
- Unzip<uint16_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- case Neon32: |
- Unzip<uint32_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
- } |
+ // vuzp.<size> Qd, Qm. |
+ switch (size) { |
+ case Neon8: { |
+ uint8_t src1[16], src2[16], dst1[16], dst2[16]; |
+ get_q_register(Vd, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 8; i++) { |
+ dst1[i] = src1[i * 2]; |
+ dst1[i + 8] = src2[i * 2]; |
+ dst2[i] = src1[i * 2 + 1]; |
+ dst2[i + 8] = src2[i * 2 + 1]; |
+ } |
+ set_q_register(Vd, dst1); |
+ set_q_register(Vm, dst2); |
+ break; |
+ } |
+ case Neon16: { |
+ uint16_t src1[8], src2[8], dst1[8], dst2[8]; |
+ get_q_register(Vd, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 4; i++) { |
+ dst1[i] = src1[i * 2]; |
+ dst1[i + 4] = src2[i * 2]; |
+ dst2[i] = src1[i * 2 + 1]; |
+ dst2[i + 4] = src2[i * 2 + 1]; |
+ } |
+ set_q_register(Vd, dst1); |
+ set_q_register(Vm, dst2); |
+ break; |
+ } |
+ case Neon32: { |
+ uint32_t src1[4], src2[4], dst1[4], dst2[4]; |
+ get_q_register(Vd, src1); |
+ get_q_register(Vm, src2); |
+ for (int i = 0; i < 2; i++) { |
+ dst1[i] = src1[i * 2]; |
+ dst1[i + 2] = src2[i * 2]; |
+ dst2[i] = src1[i * 2 + 1]; |
+ dst2[i + 2] = src2[i * 2 + 1]; |
+ } |
+ set_q_register(Vd, dst1); |
+ set_q_register(Vm, dst2); |
+ break; |
+ } |
+ default: |
+ UNREACHABLE(); |
+ break; |
} |
} |
} else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) { |
@@ -5076,32 +5197,32 @@ |
case Neon16: { |
DCHECK_EQ(Neon8, size); |
uint8_t src[16]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 16; i += 2) { |
std::swap(src[i], src[i + 1]); |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
break; |
} |
case Neon32: { |
switch (size) { |
case Neon16: { |
uint16_t src[8]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 8; i += 2) { |
std::swap(src[i], src[i + 1]); |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
break; |
} |
case Neon8: { |
uint8_t src[16]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 4; i++) { |
std::swap(src[i * 4], src[i * 4 + 3]); |
std::swap(src[i * 4 + 1], src[i * 4 + 2]); |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
break; |
} |
default: |
@@ -5114,30 +5235,30 @@ |
switch (size) { |
case Neon32: { |
uint32_t src[4]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
std::swap(src[0], src[1]); |
std::swap(src[2], src[3]); |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
break; |
} |
case Neon16: { |
uint16_t src[8]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 4; i++) { |
std::swap(src[i * 4], src[i * 4 + 3]); |
std::swap(src[i * 4 + 1], src[i * 4 + 2]); |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
break; |
} |
case Neon8: { |
uint8_t src[16]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 4; i++) { |
std::swap(src[i], src[7 - i]); |
std::swap(src[i + 8], src[15 - i]); |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
break; |
} |
default: |
@@ -5150,44 +5271,48 @@ |
UNREACHABLE(); |
break; |
} |
- } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0x1) { |
+ } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x3) { |
+ int Vd = instr->VFPDRegValue(kSimd128Precision); |
+ int Vm = instr->VFPMRegValue(kSimd128Precision); |
NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18)); |
- if (instr->Bit(6) == 0) { |
- int Vd = instr->VFPDRegValue(kDoublePrecision); |
- int Vm = instr->VFPMRegValue(kDoublePrecision); |
- // vtrn.<size> Dd, Dm. |
- switch (size) { |
- case Neon8: |
- Transpose<uint8_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- case Neon16: |
- Transpose<uint16_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- case Neon32: |
- Transpose<uint32_t, kDoubleSize>(this, Vd, Vm); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
- } |
- } else { |
- int Vd = instr->VFPDRegValue(kSimd128Precision); |
- int Vm = instr->VFPMRegValue(kSimd128Precision); |
- // vtrn.<size> Qd, Qm. |
- switch (size) { |
- case Neon8: |
- Transpose<uint8_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- case Neon16: |
- Transpose<uint16_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- case Neon32: |
- Transpose<uint32_t, kSimd128Size>(this, Vd, Vm); |
- break; |
- default: |
- UNREACHABLE(); |
- break; |
- } |
+ // vtrn.<size> Qd, Qm. |
+ switch (size) { |
+ case Neon8: { |
+ uint8_t src[16], dst[16]; |
+ get_q_register(Vd, dst); |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 8; i++) { |
+ std::swap(dst[2 * i + 1], src[2 * i]); |
+ } |
+ set_q_register(Vd, dst); |
+ set_q_register(Vm, src); |
+ break; |
+ } |
+ case Neon16: { |
+ uint16_t src[8], dst[8]; |
+ get_q_register(Vd, dst); |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 4; i++) { |
+ std::swap(dst[2 * i + 1], src[2 * i]); |
+ } |
+ set_q_register(Vd, dst); |
+ set_q_register(Vm, src); |
+ break; |
+ } |
+ case Neon32: { |
+ uint32_t src[4], dst[4]; |
+ get_q_register(Vd, dst); |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 2; i++) { |
+ std::swap(dst[2 * i + 1], src[2 * i]); |
+ } |
+ set_q_register(Vd, dst); |
+ set_q_register(Vm, src); |
+ break; |
+ } |
+ default: |
+ UNREACHABLE(); |
+ break; |
} |
} else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) { |
int Vd = instr->VFPDRegValue(kSimd128Precision); |
@@ -5198,23 +5323,41 @@ |
if (instr->Bit(10) != 0) { |
// floating point (clear sign bits) |
uint32_t src[4]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 4; i++) { |
src[i] &= ~0x80000000; |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
} else { |
// signed integer |
switch (size) { |
- case Neon8: |
- Abs<int8_t, kSimd128Size>(this, Vd, Vm); |
+ case Neon8: { |
+ int8_t src[16]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 16; i++) { |
+ src[i] = std::abs(src[i]); |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon16: |
- Abs<int16_t, kSimd128Size>(this, Vd, Vm); |
+ } |
+ case Neon16: { |
+ int16_t src[8]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 8; i++) { |
+ src[i] = std::abs(src[i]); |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon32: |
- Abs<int32_t, kSimd128Size>(this, Vd, Vm); |
+ } |
+ case Neon32: { |
+ int32_t src[4]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 4; i++) { |
+ src[i] = std::abs(src[i]); |
+ } |
+ set_q_register(Vd, src); |
break; |
+ } |
default: |
UNIMPLEMENTED(); |
break; |
@@ -5225,23 +5368,40 @@ |
if (instr->Bit(10) != 0) { |
// floating point (toggle sign bits) |
uint32_t src[4]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
for (int i = 0; i < 4; i++) { |
src[i] ^= 0x80000000; |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
} else { |
// signed integer |
switch (size) { |
- case Neon8: |
- Neg<int8_t, kSimd128Size>(this, Vd, Vm); |
+ case Neon8: { |
+ int8_t src[16]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 16; i++) { |
+ src[i] = -src[i]; |
+ } |
+ set_q_register(Vd, src); |
break; |
+ } |
case Neon16: |
- Neg<int16_t, kSimd128Size>(this, Vd, Vm); |
+ int16_t src[8]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 8; i++) { |
+ src[i] = -src[i]; |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon32: |
- Neg<int32_t, kSimd128Size>(this, Vd, Vm); |
+ case Neon32: { |
+ int32_t src[4]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 4; i++) { |
+ src[i] = -src[i]; |
+ } |
+ set_q_register(Vd, src); |
break; |
+ } |
default: |
UNIMPLEMENTED(); |
break; |
@@ -5255,7 +5415,7 @@ |
int Vd = instr->VFPDRegValue(kSimd128Precision); |
int Vm = instr->VFPMRegValue(kSimd128Precision); |
uint32_t src[4]; |
- get_neon_register(Vm, src); |
+ get_q_register(Vm, src); |
if (instr->Bit(7) == 0) { |
for (int i = 0; i < 4; i++) { |
float denom = bit_cast<float>(src[i]); |
@@ -5273,7 +5433,7 @@ |
src[i] = bit_cast<uint32_t>(result); |
} |
} |
- set_neon_register(Vd, src); |
+ set_q_register(Vd, src); |
} else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x2 && |
instr->Bits(7, 6) != 0) { |
// vqmovn.<type><size> Dd, Qm. |
@@ -5321,15 +5481,33 @@ |
int Vm = instr->VFPMRegValue(kSimd128Precision); |
NeonSize ns = static_cast<NeonSize>(size / 16); |
switch (ns) { |
- case Neon8: |
- ShiftRight<uint8_t, kSimd128Size>(this, Vd, Vm, shift); |
+ case Neon8: { |
+ uint8_t src[16]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 16; i++) { |
+ src[i] >>= shift; |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon16: |
- ShiftRight<uint16_t, kSimd128Size>(this, Vd, Vm, shift); |
+ } |
+ case Neon16: { |
+ uint16_t src[8]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 8; i++) { |
+ src[i] >>= shift; |
+ } |
+ set_q_register(Vd, src); |
break; |
- case Neon32: |
- ShiftRight<uint32_t, kSimd128Size>(this, Vd, Vm, shift); |
+ } |
+ case Neon32: { |
+ uint32_t src[4]; |
+ get_q_register(Vm, src); |
+ for (int i = 0; i < 4; i++) { |
+ src[i] >>= shift; |
+ } |
+ set_q_register(Vd, src); |
break; |
+ } |
default: |
UNREACHABLE(); |
break; |