| Index: src/arm/simulator-arm.cc
|
| diff --git a/src/arm/simulator-arm.cc b/src/arm/simulator-arm.cc
|
| index a2f3536ad3d4b0114e97b3807dfdf0e44787507f..2bf2a6a3a52b549a548bc384081e975344b63a69 100644
|
| --- a/src/arm/simulator-arm.cc
|
| +++ b/src/arm/simulator-arm.cc
|
| @@ -896,20 +896,28 @@
|
| memcpy(vfp_registers_ + dreg * 2, value, sizeof(*value) * 2);
|
| }
|
|
|
| -template <typename T, int SIZE>
|
| -void Simulator::get_neon_register(int reg, T (&value)[SIZE / sizeof(T)]) {
|
| - DCHECK(SIZE == kSimd128Size || SIZE == kDoubleSize);
|
| - DCHECK_LE(0, reg);
|
| - DCHECK_GT(SIZE == kSimd128Size ? num_q_registers : num_d_registers, reg);
|
| - memcpy(value, vfp_registers_ + reg * (SIZE / 4), SIZE);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Simulator::set_neon_register(int reg, const T (&value)[SIZE / sizeof(T)]) {
|
| - DCHECK(SIZE == kSimd128Size || SIZE == kDoubleSize);
|
| - DCHECK_LE(0, reg);
|
| - DCHECK_GT(SIZE == kSimd128Size ? num_q_registers : num_d_registers, reg);
|
| - memcpy(vfp_registers_ + reg * (SIZE / 4), value, SIZE);
|
| +template <typename T>  // Copies the contents of D register dreg into the buffer at value.
|
| +void Simulator::get_d_register(int dreg, T* value) {
|
| + DCHECK((dreg >= 0) && (dreg < num_d_registers));  // dreg must be in [0, num_d_registers).
|
| + memcpy(value, vfp_registers_ + dreg * 2, kDoubleSize);  // Reads kDoubleSize bytes starting at element dreg * 2; value must have room for them.
|
| +}
|
| +
|
| +template <typename T>  // Overwrites D register dreg with the bytes at value.
|
| +void Simulator::set_d_register(int dreg, const T* value) {
|
| + DCHECK((dreg >= 0) && (dreg < num_d_registers));  // dreg must be in [0, num_d_registers).
|
| + memcpy(vfp_registers_ + dreg * 2, value, kDoubleSize);  // Writes kDoubleSize bytes starting at element dreg * 2; value must supply that many.
|
| +}
|
| +
|
| +template <typename T>  // Copies the contents of Q register qreg into the buffer at value.
|
| +void Simulator::get_q_register(int qreg, T* value) {
|
| + DCHECK((qreg >= 0) && (qreg < num_q_registers));  // qreg must be in [0, num_q_registers).
|
| + memcpy(value, vfp_registers_ + qreg * 4, kSimd128Size);  // Reads kSimd128Size bytes starting at element qreg * 4; value must have room for them.
|
| +}
|
| +
|
| +template <typename T>  // Overwrites Q register qreg with the bytes at value.
|
| +void Simulator::set_q_register(int qreg, const T* value) {
|
| + DCHECK((qreg >= 0) && (qreg < num_q_registers));  // qreg must be in [0, num_q_registers).
|
| + memcpy(vfp_registers_ + qreg * 4, value, kSimd128Size);  // Writes kSimd128Size bytes starting at element qreg * 4; value must supply that many.
|
| }
|
|
|
| // Raw access to the PC register.
|
| @@ -3500,7 +3508,7 @@
|
| UNREACHABLE();
|
| break;
|
| }
|
| - set_neon_register(vd, q_data);
|
| + set_q_register(vd, q_data);
|
| }
|
| } else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) {
|
| // vmov (scalar to ARM core register)
|
| @@ -3987,6 +3995,7 @@
|
| }
|
|
|
| // Templated operations for NEON instructions.
|
| +// TODO(bbudge) Add more templates for use in DecodeSpecialCondition.
|
| template <typename T, typename U>
|
| U Widen(T value) {
|
| static_assert(sizeof(int64_t) > sizeof(T), "T must be int32_t or smaller");
|
| @@ -4016,38 +4025,21 @@
|
| return static_cast<T>(clamped);
|
| }
|
|
|
| +template <typename T>  // Scalar helper shared by the min/max instruction cases.
|
| +T MinMax(T a, T b, bool is_min) {
|
| + return is_min ? std::min(a, b) : std::max(a, b);  // is_min selects std::min; otherwise std::max.
|
| +}
|
| +
|
| template <typename T, typename U>
|
| void Widen(Simulator* simulator, int Vd, int Vm) {
|
| static const int kLanes = 8 / sizeof(T);
|
| T src[kLanes];
|
| U dst[kLanes];
|
| - simulator->get_neon_register<T, kDoubleSize>(Vm, src);
|
| + simulator->get_d_register(Vm, src);
|
| for (int i = 0; i < kLanes; i++) {
|
| dst[i] = Widen<T, U>(src[i]);
|
| }
|
| - simulator->set_neon_register(Vd, dst);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Abs(Simulator* simulator, int Vd, int Vm) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vm, src);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src[i] = std::abs(src[i]);
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Neg(Simulator* simulator, int Vd, int Vm) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vm, src);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src[i] = -src[i];
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src);
|
| + simulator->set_q_register(Vd, dst);
|
| }
|
|
|
| template <typename T, typename U>
|
| @@ -4055,222 +4047,35 @@
|
| static const int kLanes = 16 / sizeof(T);
|
| T src[kLanes];
|
| U dst[kLanes];
|
| - simulator->get_neon_register(Vm, src);
|
| + simulator->get_q_register(Vm, src);
|
| for (int i = 0; i < kLanes; i++) {
|
| dst[i] = Narrow<T, U>(Clamp<U>(src[i]));
|
| }
|
| - simulator->set_neon_register<U, kDoubleSize>(Vd, dst);
|
| + simulator->set_d_register(Vd, dst);
|
| }
|
|
|
| template <typename T>
|
| void AddSaturate(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| static const int kLanes = 16 / sizeof(T);
|
| T src1[kLanes], src2[kLanes];
|
| - simulator->get_neon_register(Vn, src1);
|
| - simulator->get_neon_register(Vm, src2);
|
| + simulator->get_q_register(Vn, src1);  // First operand lanes come from Q register Vn.
|
| + simulator->get_q_register(Vm, src2);  // Second operand lanes come from Q register Vm.
|
| for (int i = 0; i < kLanes; i++) {
|
| src1[i] = Clamp<T>(Widen<T, int64_t>(src1[i]) + Widen<T, int64_t>(src2[i]));
|
| }
|
| - simulator->set_neon_register(Vd, src1);
|
| + simulator->set_q_register(Vd, src1);  // The clamped per-lane sums are stored to Q register Vd.
|
| }
|
|
|
| template <typename T>
|
| void SubSaturate(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| static const int kLanes = 16 / sizeof(T);
|
| T src1[kLanes], src2[kLanes];
|
| - simulator->get_neon_register(Vn, src1);
|
| - simulator->get_neon_register(Vm, src2);
|
| + simulator->get_q_register(Vn, src1);
|
| + simulator->get_q_register(Vm, src2);
|
| for (int i = 0; i < kLanes; i++) {
|
| src1[i] = Clamp<T>(Widen<T, int64_t>(src1[i]) - Widen<T, int64_t>(src2[i]));
|
| }
|
| - simulator->set_neon_register(Vd, src1);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Zip(Simulator* simulator, int Vd, int Vm) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - static const int kPairs = kElems / 2;
|
| - T src1[kElems], src2[kElems], dst1[kElems], dst2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vd, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kPairs; i++) {
|
| - dst1[i * 2] = src1[i];
|
| - dst1[i * 2 + 1] = src2[i];
|
| - dst2[i * 2] = src1[i + kPairs];
|
| - dst2[i * 2 + 1] = src2[i + kPairs];
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, dst1);
|
| - simulator->set_neon_register<T, SIZE>(Vm, dst2);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Unzip(Simulator* simulator, int Vd, int Vm) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - static const int kPairs = kElems / 2;
|
| - T src1[kElems], src2[kElems], dst1[kElems], dst2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vd, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kPairs; i++) {
|
| - dst1[i] = src1[i * 2];
|
| - dst1[i + kPairs] = src2[i * 2];
|
| - dst2[i] = src1[i * 2 + 1];
|
| - dst2[i + kPairs] = src2[i * 2 + 1];
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, dst1);
|
| - simulator->set_neon_register<T, SIZE>(Vm, dst2);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Transpose(Simulator* simulator, int Vd, int Vm) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - static const int kPairs = kElems / 2;
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vd, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kPairs; i++) {
|
| - std::swap(src1[2 * i + 1], src2[2 * i]);
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| - simulator->set_neon_register<T, SIZE>(Vm, src2);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Test(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src1[i] = (src1[i] & src2[i]) != 0 ? -1 : 0;
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Add(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src1[i] += src2[i];
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Sub(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src1[i] -= src2[i];
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void Mul(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src1[i] *= src2[i];
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void ShiftLeft(Simulator* simulator, int Vd, int Vm, int shift) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vm, src);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src[i] <<= shift;
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void ShiftRight(Simulator* simulator, int Vd, int Vm, int shift) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vm, src);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src[i] >>= shift;
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void ArithmeticShiftRight(Simulator* simulator, int Vd, int Vm, int shift) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vm, src);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src[i] = ArithmeticShiftRight(src[i], shift);
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void CompareEqual(Simulator* simulator, int Vd, int Vm, int Vn) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src1[i] = src1[i] == src2[i] ? -1 : 0;
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void CompareGreater(Simulator* simulator, int Vd, int Vm, int Vn, bool ge) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - if (ge)
|
| - src1[i] = src1[i] >= src2[i] ? -1 : 0;
|
| - else
|
| - src1[i] = src1[i] > src2[i] ? -1 : 0;
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T>
|
| -T MinMax(T a, T b, bool is_min) {
|
| - return is_min ? std::min(a, b) : std::max(a, b);
|
| -}
|
| -
|
| -template <typename T, int SIZE>
|
| -void MinMax(Simulator* simulator, int Vd, int Vm, int Vn, bool min) {
|
| - static const int kElems = SIZE / sizeof(T);
|
| - T src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, SIZE>(Vn, src1);
|
| - simulator->get_neon_register<T, SIZE>(Vm, src2);
|
| - for (int i = 0; i < kElems; i++) {
|
| - src1[i] = MinMax(src1[i], src2[i], min);
|
| - }
|
| - simulator->set_neon_register<T, SIZE>(Vd, src1);
|
| -}
|
| -
|
| -template <typename T>
|
| -void PairwiseMinMax(Simulator* simulator, int Vd, int Vm, int Vn, bool min) {
|
| - static const int kElems = kDoubleSize / sizeof(T);
|
| - static const int kPairs = kElems / 2;
|
| - T dst[kElems], src1[kElems], src2[kElems];
|
| - simulator->get_neon_register<T, kDoubleSize>(Vn, src1);
|
| - simulator->get_neon_register<T, kDoubleSize>(Vm, src2);
|
| - for (int i = 0; i < kPairs; i++) {
|
| - dst[i] = MinMax(src1[i * 2], src1[i * 2 + 1], min);
|
| - dst[i + kPairs] = MinMax(src2[i * 2], src2[i * 2 + 1], min);
|
| - }
|
| - simulator->set_neon_register<T, kDoubleSize>(Vd, dst);
|
| + simulator->set_q_register(Vd, src1);
|
| }
|
|
|
| void Simulator::DecodeSpecialCondition(Instruction* instr) {
|
| @@ -4316,25 +4121,25 @@
|
| // vmov Qd, Qm.
|
| // vorr, Qd, Qm, Qn.
|
| uint32_t src1[4];
|
| - get_neon_register(Vm, src1);
|
| + get_q_register(Vm, src1);
|
| if (Vm != Vn) {
|
| uint32_t src2[4];
|
| - get_neon_register(Vn, src2);
|
| + get_q_register(Vn, src2);
|
| for (int i = 0; i < 4; i++) {
|
| src1[i] = src1[i] | src2[i];
|
| }
|
| }
|
| - set_neon_register(Vd, src1);
|
| + set_q_register(Vd, src1);
|
| } else if (instr->Bits(21, 20) == 0 && instr->Bit(6) == 1 &&
|
| instr->Bit(4) == 1) {
|
| // vand Qd, Qm, Qn.
|
| uint32_t src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| for (int i = 0; i < 4; i++) {
|
| src1[i] = src1[i] & src2[i];
|
| }
|
| - set_neon_register(Vd, src1);
|
| + set_q_register(Vd, src1);
|
| } else {
|
| UNIMPLEMENTED();
|
| }
|
| @@ -4368,15 +4173,45 @@
|
| bool ge = instr->Bit(4) == 1;
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| switch (size) {
|
| - case Neon8:
|
| - CompareGreater<int8_t, kSimd128Size>(this, Vd, Vm, Vn, ge);
|
| + case Neon8: {
|
| + int8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + if (ge)
|
| + src1[i] = src1[i] >= src2[i] ? 0xFF : 0;
|
| + else
|
| + src1[i] = src1[i] > src2[i] ? 0xFF : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - CompareGreater<int16_t, kSimd128Size>(this, Vd, Vm, Vn, ge);
|
| + }
|
| + case Neon16: {
|
| + int16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + if (ge)
|
| + src1[i] = src1[i] >= src2[i] ? 0xFFFF : 0;
|
| + else
|
| + src1[i] = src1[i] > src2[i] ? 0xFFFF : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - CompareGreater<int32_t, kSimd128Size>(this, Vd, Vm, Vn, ge);
|
| + }
|
| + case Neon32: {
|
| + int32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + if (ge)
|
| + src1[i] = src1[i] >= src2[i] ? 0xFFFFFFFF : 0;
|
| + else
|
| + src1[i] = src1[i] > src2[i] ? 0xFFFFFFFF : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4388,15 +4223,36 @@
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| bool min = instr->Bit(4) != 0;
|
| switch (size) {
|
| - case Neon8:
|
| - MinMax<int8_t, kSimd128Size>(this, Vd, Vm, Vn, min);
|
| + case Neon8: {
|
| + int8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] = MinMax(src1[i], src2[i], min);
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - MinMax<int16_t, kSimd128Size>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon16: {
|
| + int16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] = MinMax(src1[i], src2[i], min);
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - MinMax<int32_t, kSimd128Size>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon32: {
|
| + int32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] = MinMax(src1[i], src2[i], min);
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4409,15 +4265,36 @@
|
| if (instr->Bit(4) == 0) {
|
| // vadd.i<size> Qd, Qm, Qn.
|
| switch (size) {
|
| - case Neon8:
|
| - Add<uint8_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] += src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - Add<uint16_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] += src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - Add<uint32_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] += src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4425,15 +4302,36 @@
|
| } else {
|
| // vtst.i<size> Qd, Qm, Qn.
|
| switch (size) {
|
| - case Neon8:
|
| - Test<uint8_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - Test<uint16_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - Test<uint32_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] = (src1[i] & src2[i]) != 0 ? 0xFFFFFFFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4446,15 +4344,36 @@
|
| // vmul.i<size> Qd, Qm, Qn.
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| switch (size) {
|
| - case Neon8:
|
| - Mul<uint8_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] *= src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - Mul<uint16_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] *= src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - Mul<uint32_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] *= src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4469,15 +4388,37 @@
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| bool min = instr->Bit(4) != 0;
|
| switch (size) {
|
| - case Neon8:
|
| - PairwiseMinMax<int8_t>(this, Vd, Vm, Vn, min);
|
| + case Neon8: {
|
| + int8_t dst[8], src1[8], src2[8];
|
| + get_d_register(Vn, src1);
|
| + get_d_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min);
|
| + dst[i + 4] = MinMax(src2[i * 2], src2[i * 2 + 1], min);
|
| + }
|
| + set_d_register(Vd, dst);
|
| break;
|
| - case Neon16:
|
| - PairwiseMinMax<int16_t>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon16: {
|
| + int16_t dst[4], src1[4], src2[4];
|
| + get_d_register(Vn, src1);
|
| + get_d_register(Vm, src2);
|
| + for (int i = 0; i < 2; i++) {
|
| + dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min);
|
| + dst[i + 2] = MinMax(src2[i * 2], src2[i * 2 + 1], min);
|
| + }
|
| + set_d_register(Vd, dst);
|
| break;
|
| - case Neon32:
|
| - PairwiseMinMax<int32_t>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon32: {
|
| + int32_t dst[2], src1[2], src2[2];
|
| + get_d_register(Vn, src1);
|
| + get_d_register(Vm, src2);
|
| + dst[0] = MinMax(src1[0], src1[1], min);
|
| + dst[1] = MinMax(src2[0], src2[1], min);
|
| + set_d_register(Vd, dst);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4487,8 +4428,8 @@
|
| case 0xd: {
|
| if (instr->Bit(4) == 0) {
|
| float src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| for (int i = 0; i < 4; i++) {
|
| if (instr->Bit(21) == 0) {
|
| // vadd.f32 Qd, Qm, Qn.
|
| @@ -4498,7 +4439,7 @@
|
| src1[i] = src1[i] - src2[i];
|
| }
|
| }
|
| - set_neon_register(Vd, src1);
|
| + set_q_register(Vd, src1);
|
| } else {
|
| UNIMPLEMENTED();
|
| }
|
| @@ -4508,13 +4449,13 @@
|
| if (instr->Bits(21, 20) == 0 && instr->Bit(4) == 0) {
|
| // vceq.f32.
|
| float src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| uint32_t dst[4];
|
| for (int i = 0; i < 4; i++) {
|
| dst[i] = (src1[i] == src2[i]) ? 0xFFFFFFFF : 0;
|
| }
|
| - set_neon_register(Vd, dst);
|
| + set_q_register(Vd, dst);
|
| } else {
|
| UNIMPLEMENTED();
|
| }
|
| @@ -4523,8 +4464,8 @@
|
| case 0xf: {
|
| if (instr->Bit(20) == 0 && instr->Bit(6) == 1) {
|
| float src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| if (instr->Bit(4) == 1) {
|
| if (instr->Bit(21) == 0) {
|
| // vrecps.f32 Qd, Qm, Qn.
|
| @@ -4544,7 +4485,7 @@
|
| src1[i] = MinMax(src1[i], src2[i], min);
|
| }
|
| }
|
| - set_neon_register(Vd, src1);
|
| + set_q_register(Vd, src1);
|
| } else {
|
| UNIMPLEMENTED();
|
| }
|
| @@ -4585,8 +4526,8 @@
|
| int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| int Vn = instr->VFPNRegValue(kSimd128Precision);
|
| uint8_t src1[16], src2[16], dst[16];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| int boundary = kSimd128Size - imm4;
|
| int i = 0;
|
| for (; i < boundary; i++) {
|
| @@ -4595,7 +4536,7 @@
|
| for (; i < 16; i++) {
|
| dst[i] = src2[i - boundary];
|
| }
|
| - set_neon_register(Vd, dst);
|
| + set_q_register(Vd, dst);
|
| } else if (instr->Bits(11, 7) == 0xA && instr->Bit(4) == 1) {
|
| // vshl.i<size> Qd, Qm, shift
|
| int size = base::bits::RoundDownToPowerOfTwo32(instr->Bits(21, 16));
|
| @@ -4604,15 +4545,33 @@
|
| int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| NeonSize ns = static_cast<NeonSize>(size / 16);
|
| switch (ns) {
|
| - case Neon8:
|
| - ShiftLeft<uint8_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + case Neon8: {
|
| + uint8_t src[16];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 16; i++) {
|
| + src[i] <<= shift;
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon16:
|
| - ShiftLeft<uint16_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src[8];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 8; i++) {
|
| + src[i] <<= shift;
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon32:
|
| - ShiftLeft<uint32_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src[4];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 4; i++) {
|
| + src[i] <<= shift;
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4625,15 +4584,33 @@
|
| int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| NeonSize ns = static_cast<NeonSize>(size / 16);
|
| switch (ns) {
|
| - case Neon8:
|
| - ArithmeticShiftRight<int8_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + case Neon8: {
|
| + int8_t src[16];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 16; i++) {
|
| + src[i] = ArithmeticShiftRight(src[i], shift);
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon16:
|
| - ArithmeticShiftRight<int16_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + }
|
| + case Neon16: {
|
| + int16_t src[8];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 8; i++) {
|
| + src[i] = ArithmeticShiftRight(src[i], shift);
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon32:
|
| - ArithmeticShiftRight<int32_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + }
|
| + case Neon32: {
|
| + int32_t src[4];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 4; i++) {
|
| + src[i] = ArithmeticShiftRight(src[i], shift);
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4681,13 +4658,13 @@
|
| if (instr->Bits(21, 20) == 1 && instr->Bit(4) == 1) {
|
| // vbsl.size Qd, Qm, Qn.
|
| uint32_t dst[4], src1[4], src2[4];
|
| - get_neon_register(Vd, dst);
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vd, dst);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| for (int i = 0; i < 4; i++) {
|
| dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]);
|
| }
|
| - set_neon_register(Vd, dst);
|
| + set_q_register(Vd, dst);
|
| } else if (instr->Bits(21, 20) == 0 && instr->Bit(4) == 1) {
|
| if (instr->Bit(6) == 0) {
|
| // veor Dd, Dn, Dm
|
| @@ -4700,10 +4677,10 @@
|
| } else {
|
| // veor Qd, Qn, Qm
|
| uint32_t src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| for (int i = 0; i < 4; i++) src1[i] ^= src2[i];
|
| - set_neon_register(Vd, src1);
|
| + set_q_register(Vd, src1);
|
| }
|
| } else {
|
| UNIMPLEMENTED();
|
| @@ -4738,15 +4715,45 @@
|
| bool ge = instr->Bit(4) == 1;
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| switch (size) {
|
| - case Neon8:
|
| - CompareGreater<uint8_t, kSimd128Size>(this, Vd, Vm, Vn, ge);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + if (ge)
|
| + src1[i] = src1[i] >= src2[i] ? 0xFFu : 0;
|
| + else
|
| + src1[i] = src1[i] > src2[i] ? 0xFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - CompareGreater<uint16_t, kSimd128Size>(this, Vd, Vm, Vn, ge);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + if (ge)
|
| + src1[i] = src1[i] >= src2[i] ? 0xFFFFu : 0;
|
| + else
|
| + src1[i] = src1[i] > src2[i] ? 0xFFFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - CompareGreater<uint32_t, kSimd128Size>(this, Vd, Vm, Vn, ge);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + if (ge)
|
| + src1[i] = src1[i] >= src2[i] ? 0xFFFFFFFFu : 0;
|
| + else
|
| + src1[i] = src1[i] > src2[i] ? 0xFFFFFFFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4758,15 +4765,36 @@
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| bool min = instr->Bit(4) != 0;
|
| switch (size) {
|
| - case Neon8:
|
| - MinMax<uint8_t, kSimd128Size>(this, Vd, Vm, Vn, min);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] = MinMax(src1[i], src2[i], min);
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - MinMax<uint16_t, kSimd128Size>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] = MinMax(src1[i], src2[i], min);
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - MinMax<uint32_t, kSimd128Size>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] = MinMax(src1[i], src2[i], min);
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4778,15 +4806,36 @@
|
| // vsub.size Qd, Qm, Qn.
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| switch (size) {
|
| - case Neon8:
|
| - Sub<uint8_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] -= src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - Sub<uint16_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] -= src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - Sub<uint32_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] -= src2[i];
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4795,15 +4844,36 @@
|
| // vceq.size Qd, Qm, Qn.
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| switch (size) {
|
| - case Neon8:
|
| - CompareEqual<uint8_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 16; i++) {
|
| + src1[i] = (src1[i] == src2[i]) ? 0xFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon16:
|
| - CompareEqual<uint16_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + src1[i] = (src1[i] == src2[i]) ? 0xFFFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| - case Neon32:
|
| - CompareEqual<uint32_t, kSimd128Size>(this, Vd, Vm, Vn);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4];
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + src1[i] = (src1[i] == src2[i]) ? 0xFFFFFFFFu : 0;
|
| + }
|
| + set_q_register(Vd, src1);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4816,15 +4886,37 @@
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
|
| bool min = instr->Bit(4) != 0;
|
| switch (size) {
|
| - case Neon8:
|
| - PairwiseMinMax<uint8_t>(this, Vd, Vm, Vn, min);
|
| + case Neon8: {
|
| + uint8_t dst[8], src1[8], src2[8];
|
| + get_d_register(Vn, src1);
|
| + get_d_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min);
|
| + dst[i + 4] = MinMax(src2[i * 2], src2[i * 2 + 1], min);
|
| + }
|
| + set_d_register(Vd, dst);
|
| break;
|
| - case Neon16:
|
| - PairwiseMinMax<uint16_t>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon16: {
|
| + uint16_t dst[4], src1[4], src2[4];
|
| + get_d_register(Vn, src1);
|
| + get_d_register(Vm, src2);
|
| + for (int i = 0; i < 2; i++) {
|
| + dst[i + 0] = MinMax(src1[i * 2], src1[i * 2 + 1], min);
|
| + dst[i + 2] = MinMax(src2[i * 2], src2[i * 2 + 1], min);
|
| + }
|
| + set_d_register(Vd, dst);
|
| break;
|
| - case Neon32:
|
| - PairwiseMinMax<uint32_t>(this, Vd, Vm, Vn, min);
|
| + }
|
| + case Neon32: {
|
| + uint32_t dst[2], src1[2], src2[2];
|
| + get_d_register(Vn, src1);
|
| + get_d_register(Vm, src2);
|
| + dst[0] = MinMax(src1[0], src1[1], min);
|
| + dst[1] = MinMax(src2[0], src2[1], min);
|
| + set_d_register(Vd, dst);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
| @@ -4835,12 +4927,12 @@
|
| if (instr->Bit(21) == 0 && instr->Bit(6) == 1 && instr->Bit(4) == 1) {
|
| // vmul.f32 Qd, Qn, Qm
|
| float src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| for (int i = 0; i < 4; i++) {
|
| src1[i] = src1[i] * src2[i];
|
| }
|
| - set_neon_register(Vd, src1);
|
| + set_q_register(Vd, src1);
|
| } else {
|
| UNIMPLEMENTED();
|
| }
|
| @@ -4851,8 +4943,8 @@
|
| // vcge/vcgt.f32 Qd, Qm, Qn
|
| bool ge = instr->Bit(21) == 0;
|
| float src1[4], src2[4];
|
| - get_neon_register(Vn, src1);
|
| - get_neon_register(Vm, src2);
|
| + get_q_register(Vn, src1);
|
| + get_q_register(Vm, src2);
|
| uint32_t dst[4];
|
| for (int i = 0; i < 4; i++) {
|
| if (ge) {
|
| @@ -4861,7 +4953,7 @@
|
| dst[i] = src1[i] > src2[i] ? 0xFFFFFFFFu : 0;
|
| }
|
| }
|
| - set_neon_register(Vd, dst);
|
| + set_q_register(Vd, dst);
|
| } else {
|
| UNIMPLEMENTED();
|
| }
|
| @@ -4902,7 +4994,7 @@
|
| int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| uint32_t q_data[4];
|
| - get_neon_register(Vm, q_data);
|
| + get_q_register(Vm, q_data);
|
| int op = instr->Bits(8, 7);
|
| for (int i = 0; i < 4; i++) {
|
| switch (op) {
|
| @@ -4928,7 +5020,7 @@
|
| break;
|
| }
|
| }
|
| - set_neon_register(Vd, q_data);
|
| + set_q_register(Vd, q_data);
|
| } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
|
| if (instr->Bit(6) == 0) {
|
| // vswp Dd, Dm.
|
| @@ -4944,10 +5036,10 @@
|
| uint32_t dval[4], mval[4];
|
| int vd = instr->VFPDRegValue(kSimd128Precision);
|
| int vm = instr->VFPMRegValue(kSimd128Precision);
|
| - get_neon_register(vd, dval);
|
| - get_neon_register(vm, mval);
|
| - set_neon_register(vm, dval);
|
| - set_neon_register(vd, mval);
|
| + get_q_register(vd, dval);
|
| + get_q_register(vm, mval);
|
| + set_q_register(vm, dval);
|
| + set_q_register(vd, mval);
|
| }
|
| } else if (instr->Bits(11, 7) == 0x18) {
|
| // vdup.32 Qd, Sm.
|
| @@ -4957,15 +5049,15 @@
|
| uint32_t s_data = get_s_register(vm * 2 + index);
|
| uint32_t q_data[4];
|
| for (int i = 0; i < 4; i++) q_data[i] = s_data;
|
| - set_neon_register(vd, q_data);
|
| + set_q_register(vd, q_data);
|
| } else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
|
| // vmvn Qd, Qm.
|
| int vd = instr->VFPDRegValue(kSimd128Precision);
|
| int vm = instr->VFPMRegValue(kSimd128Precision);
|
| uint32_t q_data[4];
|
| - get_neon_register(vm, q_data);
|
| + get_q_register(vm, q_data);
|
| for (int i = 0; i < 4; i++) q_data[i] = ~q_data[i];
|
| - set_neon_register(vd, q_data);
|
| + set_q_register(vd, q_data);
|
| } else if (instr->Bits(11, 10) == 0x2) {
|
| // vtb[l,x] Dd, <list>, Dm.
|
| int vd = instr->VFPDRegValue(kDoublePrecision);
|
| @@ -4990,79 +5082,108 @@
|
| }
|
| }
|
| set_d_register(vd, &result);
|
| - } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x1) {
|
| + } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x1 &&
|
| + instr->Bit(6) == 1) {
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
|
| - if (instr->Bit(6) == 0) {
|
| - int Vd = instr->VFPDRegValue(kDoublePrecision);
|
| - int Vm = instr->VFPMRegValue(kDoublePrecision);
|
| - if (instr->Bit(7) == 1) {
|
| - // vzip.<size> Dd, Dm.
|
| - switch (size) {
|
| - case Neon8:
|
| - Zip<uint8_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - case Neon16:
|
| - Zip<uint16_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - case Neon32:
|
| - Zip<uint32_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - default:
|
| - UNREACHABLE();
|
| - break;
|
| - }
|
| - } else {
|
| - // vuzp.<size> Dd, Dm.
|
| - switch (size) {
|
| - case Neon8:
|
| - Unzip<uint8_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - case Neon16:
|
| - Unzip<uint16_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - case Neon32:
|
| - Unzip<uint32_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - default:
|
| - UNREACHABLE();
|
| - break;
|
| - }
|
| + int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| + int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| + if (instr->Bit(7) == 1) {
|
| + // vzip.<size> Qd, Qm.
|
| + switch (size) {
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16], dst1[16], dst2[16];
|
| + get_q_register(Vd, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + dst1[i * 2] = src1[i];
|
| + dst1[i * 2 + 1] = src2[i];
|
| + dst2[i * 2] = src1[i + 8];
|
| + dst2[i * 2 + 1] = src2[i + 8];
|
| + }
|
| + set_q_register(Vd, dst1);
|
| + set_q_register(Vm, dst2);
|
| + break;
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8], dst1[8], dst2[8];
|
| + get_q_register(Vd, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + dst1[i * 2] = src1[i];
|
| + dst1[i * 2 + 1] = src2[i];
|
| + dst2[i * 2] = src1[i + 4];
|
| + dst2[i * 2 + 1] = src2[i + 4];
|
| + }
|
| + set_q_register(Vd, dst1);
|
| + set_q_register(Vm, dst2);
|
| + break;
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4], dst1[4], dst2[4];
|
| + get_q_register(Vd, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 2; i++) {
|
| + dst1[i * 2] = src1[i];
|
| + dst1[i * 2 + 1] = src2[i];
|
| + dst2[i * 2] = src1[i + 2];
|
| + dst2[i * 2 + 1] = src2[i + 2];
|
| + }
|
| + set_q_register(Vd, dst1);
|
| + set_q_register(Vm, dst2);
|
| + break;
|
| + }
|
| + default:
|
| + UNREACHABLE();
|
| + break;
|
| }
|
| } else {
|
| - int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| - int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| - if (instr->Bit(7) == 1) {
|
| - // vzip.<size> Qd, Qm.
|
| - switch (size) {
|
| - case Neon8:
|
| - Zip<uint8_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - case Neon16:
|
| - Zip<uint16_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - case Neon32:
|
| - Zip<uint32_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - default:
|
| - UNREACHABLE();
|
| - break;
|
| - }
|
| - } else {
|
| - // vuzp.<size> Qd, Qm.
|
| - switch (size) {
|
| - case Neon8:
|
| - Unzip<uint8_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - case Neon16:
|
| - Unzip<uint16_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - case Neon32:
|
| - Unzip<uint32_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - default:
|
| - UNREACHABLE();
|
| - break;
|
| - }
|
| + // vuzp.<size> Qd, Qm.
|
| + switch (size) {
|
| + case Neon8: {
|
| + uint8_t src1[16], src2[16], dst1[16], dst2[16];
|
| + get_q_register(Vd, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 8; i++) {
|
| + dst1[i] = src1[i * 2];
|
| + dst1[i + 8] = src2[i * 2];
|
| + dst2[i] = src1[i * 2 + 1];
|
| + dst2[i + 8] = src2[i * 2 + 1];
|
| + }
|
| + set_q_register(Vd, dst1);
|
| + set_q_register(Vm, dst2);
|
| + break;
|
| + }
|
| + case Neon16: {
|
| + uint16_t src1[8], src2[8], dst1[8], dst2[8];
|
| + get_q_register(Vd, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 4; i++) {
|
| + dst1[i] = src1[i * 2];
|
| + dst1[i + 4] = src2[i * 2];
|
| + dst2[i] = src1[i * 2 + 1];
|
| + dst2[i + 4] = src2[i * 2 + 1];
|
| + }
|
| + set_q_register(Vd, dst1);
|
| + set_q_register(Vm, dst2);
|
| + break;
|
| + }
|
| + case Neon32: {
|
| + uint32_t src1[4], src2[4], dst1[4], dst2[4];
|
| + get_q_register(Vd, src1);
|
| + get_q_register(Vm, src2);
|
| + for (int i = 0; i < 2; i++) {
|
| + dst1[i] = src1[i * 2];
|
| + dst1[i + 2] = src2[i * 2];
|
| + dst2[i] = src1[i * 2 + 1];
|
| + dst2[i + 2] = src2[i * 2 + 1];
|
| + }
|
| + set_q_register(Vd, dst1);
|
| + set_q_register(Vm, dst2);
|
| + break;
|
| + }
|
| + default:
|
| + UNREACHABLE();
|
| + break;
|
| }
|
| }
|
| } else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) {
|
| @@ -5076,32 +5197,32 @@
|
| case Neon16: {
|
| DCHECK_EQ(Neon8, size);
|
| uint8_t src[16];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 16; i += 2) {
|
| std::swap(src[i], src[i + 1]);
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| break;
|
| }
|
| case Neon32: {
|
| switch (size) {
|
| case Neon16: {
|
| uint16_t src[8];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 8; i += 2) {
|
| std::swap(src[i], src[i + 1]);
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| break;
|
| }
|
| case Neon8: {
|
| uint8_t src[16];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 4; i++) {
|
| std::swap(src[i * 4], src[i * 4 + 3]);
|
| std::swap(src[i * 4 + 1], src[i * 4 + 2]);
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| break;
|
| }
|
| default:
|
| @@ -5114,30 +5235,30 @@
|
| switch (size) {
|
| case Neon32: {
|
| uint32_t src[4];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| std::swap(src[0], src[1]);
|
| std::swap(src[2], src[3]);
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| break;
|
| }
|
| case Neon16: {
|
| uint16_t src[8];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 4; i++) {
|
| std::swap(src[i * 4], src[i * 4 + 3]);
|
| std::swap(src[i * 4 + 1], src[i * 4 + 2]);
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| break;
|
| }
|
| case Neon8: {
|
| uint8_t src[16];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 4; i++) {
|
| std::swap(src[i], src[7 - i]);
|
| std::swap(src[i + 8], src[15 - i]);
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| break;
|
| }
|
| default:
|
| @@ -5150,44 +5271,48 @@
|
| UNREACHABLE();
|
| break;
|
| }
|
| - } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0x1) {
|
| + } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x3) {
|
| + int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| + int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
|
| - if (instr->Bit(6) == 0) {
|
| - int Vd = instr->VFPDRegValue(kDoublePrecision);
|
| - int Vm = instr->VFPMRegValue(kDoublePrecision);
|
| - // vtrn.<size> Dd, Dm.
|
| - switch (size) {
|
| - case Neon8:
|
| - Transpose<uint8_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - case Neon16:
|
| - Transpose<uint16_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - case Neon32:
|
| - Transpose<uint32_t, kDoubleSize>(this, Vd, Vm);
|
| - break;
|
| - default:
|
| - UNREACHABLE();
|
| - break;
|
| - }
|
| - } else {
|
| - int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| - int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| - // vtrn.<size> Qd, Qm.
|
| - switch (size) {
|
| - case Neon8:
|
| - Transpose<uint8_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - case Neon16:
|
| - Transpose<uint16_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - case Neon32:
|
| - Transpose<uint32_t, kSimd128Size>(this, Vd, Vm);
|
| - break;
|
| - default:
|
| - UNREACHABLE();
|
| - break;
|
| - }
|
| + // vtrn.<size> Qd, Qm.
|
| + switch (size) {
|
| + case Neon8: {
|
| + uint8_t src[16], dst[16];
|
| + get_q_register(Vd, dst);
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 8; i++) {
|
| + std::swap(dst[2 * i + 1], src[2 * i]);
|
| + }
|
| + set_q_register(Vd, dst);
|
| + set_q_register(Vm, src);
|
| + break;
|
| + }
|
| + case Neon16: {
|
| + uint16_t src[8], dst[8];
|
| + get_q_register(Vd, dst);
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 4; i++) {
|
| + std::swap(dst[2 * i + 1], src[2 * i]);
|
| + }
|
| + set_q_register(Vd, dst);
|
| + set_q_register(Vm, src);
|
| + break;
|
| + }
|
| + case Neon32: {
|
| + uint32_t src[4], dst[4];
|
| + get_q_register(Vd, dst);
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 2; i++) {
|
| + std::swap(dst[2 * i + 1], src[2 * i]);
|
| + }
|
| + set_q_register(Vd, dst);
|
| + set_q_register(Vm, src);
|
| + break;
|
| + }
|
| + default:
|
| + UNREACHABLE();
|
| + break;
|
| }
|
| } else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) {
|
| int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| @@ -5198,23 +5323,41 @@
|
| if (instr->Bit(10) != 0) {
|
| // floating point (clear sign bits)
|
| uint32_t src[4];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 4; i++) {
|
| src[i] &= ~0x80000000;
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| } else {
|
| // signed integer
|
| switch (size) {
|
| - case Neon8:
|
| - Abs<int8_t, kSimd128Size>(this, Vd, Vm);
|
| + case Neon8: {
|
| + int8_t src[16];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 16; i++) {
|
| + src[i] = std::abs(src[i]);
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon16:
|
| - Abs<int16_t, kSimd128Size>(this, Vd, Vm);
|
| + }
|
| + case Neon16: {
|
| + int16_t src[8];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 8; i++) {
|
| + src[i] = std::abs(src[i]);
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon32:
|
| - Abs<int32_t, kSimd128Size>(this, Vd, Vm);
|
| + }
|
| + case Neon32: {
|
| + int32_t src[4];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 4; i++) {
|
| + src[i] = std::abs(src[i]);
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| + }
|
| default:
|
| UNIMPLEMENTED();
|
| break;
|
| @@ -5225,23 +5368,40 @@
|
| if (instr->Bit(10) != 0) {
|
| // floating point (toggle sign bits)
|
| uint32_t src[4];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| for (int i = 0; i < 4; i++) {
|
| src[i] ^= 0x80000000;
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| } else {
|
| // signed integer
|
| switch (size) {
|
| - case Neon8:
|
| - Neg<int8_t, kSimd128Size>(this, Vd, Vm);
|
| + case Neon8: {
|
| + int8_t src[16];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 16; i++) {
|
| + src[i] = -src[i];
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| + }
|
| case Neon16:
|
| - Neg<int16_t, kSimd128Size>(this, Vd, Vm);
|
| + int16_t src[8];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 8; i++) {
|
| + src[i] = -src[i];
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon32:
|
| - Neg<int32_t, kSimd128Size>(this, Vd, Vm);
|
| + case Neon32: {
|
| + int32_t src[4];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 4; i++) {
|
| + src[i] = -src[i];
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| + }
|
| default:
|
| UNIMPLEMENTED();
|
| break;
|
| @@ -5255,7 +5415,7 @@
|
| int Vd = instr->VFPDRegValue(kSimd128Precision);
|
| int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| uint32_t src[4];
|
| - get_neon_register(Vm, src);
|
| + get_q_register(Vm, src);
|
| if (instr->Bit(7) == 0) {
|
| for (int i = 0; i < 4; i++) {
|
| float denom = bit_cast<float>(src[i]);
|
| @@ -5273,7 +5433,7 @@
|
| src[i] = bit_cast<uint32_t>(result);
|
| }
|
| }
|
| - set_neon_register(Vd, src);
|
| + set_q_register(Vd, src);
|
| } else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x2 &&
|
| instr->Bits(7, 6) != 0) {
|
| // vqmovn.<type><size> Dd, Qm.
|
| @@ -5321,15 +5481,33 @@
|
| int Vm = instr->VFPMRegValue(kSimd128Precision);
|
| NeonSize ns = static_cast<NeonSize>(size / 16);
|
| switch (ns) {
|
| - case Neon8:
|
| - ShiftRight<uint8_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + case Neon8: {
|
| + uint8_t src[16];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 16; i++) {
|
| + src[i] >>= shift;
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon16:
|
| - ShiftRight<uint16_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + }
|
| + case Neon16: {
|
| + uint16_t src[8];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 8; i++) {
|
| + src[i] >>= shift;
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| - case Neon32:
|
| - ShiftRight<uint32_t, kSimd128Size>(this, Vd, Vm, shift);
|
| + }
|
| + case Neon32: {
|
| + uint32_t src[4];
|
| + get_q_register(Vm, src);
|
| + for (int i = 0; i < 4; i++) {
|
| + src[i] >>= shift;
|
| + }
|
| + set_q_register(Vd, src);
|
| break;
|
| + }
|
| default:
|
| UNREACHABLE();
|
| break;
|
|
|