src/arm/lithium-codegen-arm.cc - Issue 6625084: ARM: Improved double to integer truncation....

Unified Diff: src/arm/lithium-codegen-arm.cc

Issue 6625084: ARM: Improved double to integer truncation.... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 9 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/arm/lithium-codegen-arm.cc

===================================================================

--- src/arm/lithium-codegen-arm.cc (revision 7092)

+++ src/arm/lithium-codegen-arm.cc (working copy)

@@ -2716,6 +2716,188 @@

}

+void LCodeGen::TryVcvtTruncation(Register result,

+ DwVfpRegister double_input,

+ Register saved_fpscr,

+ Register current_fpscr,

+ SwVfpRegister single_scratch,

+ TruncationType type,

+ Label* success) {

+ // Cumulative exception flags.

+ __ bic(current_fpscr, saved_fpscr, Operand(kVFPExceptionMask |

+ kVFPFlushToZeroMask));

+ __ vmsr(current_fpscr);

+ // Try a standard vfp floating-point to integer truncation, using the

+ // default 'round to zero' mode.

+ if (type == kSignedTruncation) {

+ __ vcvt_s32_f64(single_scratch, double_input);

+ } else {

+ __ vcvt_u32_f64(single_scratch, double_input);

+ }

+ // Retrieve FPSCR and check for vfp exceptions.

+ __ vmrs(current_fpscr);

+ __ tst(current_fpscr, Operand(kVFPExceptionMask));

+ // Load the result and restore the FPSCR.

+ __ vmov(result, single_scratch);

+ // Restore the saved FPSCR.

+ __ vmsr(saved_fpscr);

+ // If no vfp exceptions were raised we are done. Otherwise fall through.

+ __ b(eq, success);

+// The truncation process is:

+// 1: Try to truncate using VFP floating-point to integer vcvt instructions.

+// a: Try to truncate to a signed int.

+// b: If that fails, try to truncate to an unsigned int.

+// 2: If that fails, try to bring back the input value in the 32bit int range.

+// If we succeed jump backward to let vcvt instructions truncate the value.

+// 3: If we could not bring back the value to the int32 range, check for special

+// cases.

+// 4: If that also fails, fall through. The following code should handle the

+// failure, probably by deoptimizing.

+void LCodeGen::EmitECMATruncate(Register result,

Karl Klose 2011/03/09 10:37:51 As fschneider suggested, it would be good to move

Alexandre 2011/03/15 08:45:39 I first moved everything to a stub. Then I refact

+ Register scratch1,

+ Register scratch2,

+ DwVfpRegister double_input,

+ DwVfpRegister double_scratch1,

+ DwVfpRegister double_scratch2,

+ Label* done) {

+ ASSERT(!scratch1.is(result));

+ ASSERT(!scratch2.is(result));

+ ASSERT(!scratch1.is(scratch2));

+ ASSERT(!double_scratch1.is(double_input));

+ ASSERT(!double_scratch2.is(double_input));

+ ASSERT(!double_scratch1.is(double_scratch2));

+ Register prev_fpscr = scratch1;

+ Register curr_fpscr = scratch2;

+ scratch1 = no_reg;

+ scratch2 = no_reg;

+ SwVfpRegister single_scratch = double_scratch2.low();

+ Label retry, check_special_cases;

+ // Save the current FPSCR.

+ __ vmrs(prev_fpscr);

+ __ bind(&retry);

+ // Try standard vfp floating-point to integer truncations, using the

+ // default 'round to zero' mode.

Søren Thygesen Gjesse 2011/03/08 16:15:03 Drive-by: How fast is the VFP rounding? Maybe just

Karl Klose 2011/03/09 10:37:51 We should measure later, if bit-fiddling code as

Alexandre 2011/03/15 08:45:39 I initially thought that the vfp would be faster.

+ TryVcvtTruncation(result,

+ double_input,

+ prev_fpscr,

+ curr_fpscr,

+ single_scratch,

+ kSignedTruncation,

+ done);

+ // Exceptions were raised. Try an unsigned conversion.

+ TryVcvtTruncation(result,

+ double_input,

+ prev_fpscr,

+ curr_fpscr,

+ single_scratch,

+ kUnsignedTruncation,

+ done);

+ // Standard conversion did not work. Try to handle manually.

+ // Clear vfp cumulative exception flags.

+ __ bic(curr_fpscr, curr_fpscr, Operand(kVFPExceptionMask));

+ __ vmsr(curr_fpscr);

+ // The truncating conversion is invariant modulo 2^32.

+ // If we are lucky, we can easily bring the input value to the

+ // [-2^32, 2^32] range.

+ Label positive, in_two_31_range;

+ const double two_31_value = 2147483648.0;

Karl Klose 2011/03/09 10:37:51 Constants should be formatted as follows: kTwo31Va

Alexandre 2011/03/15 08:45:39 Done.

+ const double two_32_value = 4294967296.0;

+ // Start bringing the input value to the [-2^32, 2^32] range.

+ DwVfpRegister two_32 = double_scratch2;

+ __ vmov(two_32, two_32_value);

+ __ vdiv(double_scratch1, double_input, two_32);

+ __ vcvt_s32_f64(double_scratch1.low(), double_scratch1);

+ __ vcvt_f64_s32(double_scratch1, double_scratch1.low());

+ __ vmul(double_scratch1, double_scratch1, two_32);

+ // Test for vfp exceptions.

+ __ vmrs(curr_fpscr);

+ __ tst(curr_fpscr, Operand(kVFPExceptionMask));

+ // The following code won't work if vfp exceptions were raised.

+ // (Overflow is raised for high values, infinity. Invalid exception for NaN.)

+ __ b(ne, &check_special_cases);

+ // Perform the subtraction after the branch to preserve the input.

+ __ vsub(double_input, double_input, double_scratch1);

+ // double_input: value brought back to [-2^32, 2^32].

+ // Get the value rounded toward 0.

+ DwVfpRegister two_31 = double_scratch2;

+ __ vabs(double_scratch1, double_input);

+ __ vmov(two_31, two_31_value);

+ __ vcmp(double_scratch1, two_31);

+ __ vmrs(pc);

+ __ b(lt, &in_two_31_range);

+ // The value is in the [-2^32, -2^31] U [2^31, 2^32] range.

+ // Add or subtrct 2^31 to easily round it toward zero.

Karl Klose 2011/03/09 10:37:51 subtrct -> subtract.

Alexandre 2011/03/15 08:45:39 Done.

+ // Push negative values below -2^31 to the positive range to let vcvt_u32_f64

+ // handle the conversion. (For negative value we add 2^31 to easily round,

+ // then add 2^31 again instead of subtracting. This works because the

+ // operation is invariant modulo 2^32.)

+ __ vcmp(double_input, 0.0);

+ __ vmrs(pc);

+ __ vadd(double_input, double_input, two_31, lt);

+ __ vsub(double_input, double_input, two_31, ge);

+ __ vcvt_s32_f64(double_input.low(), double_input);

Karl Klose 2011/03/09 10:37:51 Should this code not use vcvt_u32_f64 as stated in

Alexandre 2011/03/15 08:45:39 No it should not. I updated the comment before to

+ __ vcvt_f64_s32(double_input, double_input.low());

+ __ vadd(double_input, double_input, two_31);

+ __ b(&retry);

+ __ bind(&in_two_31_range);

+ // Round the value toward zero and jump back to let the standard

+ // code handle the conversion.

+ __ vcvt_s32_f64(double_input.low(), double_input);

+ __ vcvt_f64_s32(double_input, double_input.low());

+ __ b(&retry);

+ // We never fall through to here.

+ // We always jump to 'done' if conversion was successful.

+ if (FLAG_debug_code) {

+ __ Abort("We should never fall through.");

+ }

+ // Check for a high exponent, infinity, and NaN, which should all return 0.

+ // * If the unbiased exponent is greater than 52 + 32 = 84 then all mantissa

+ // bits are shifted out of the 32bit integer range and the result is 0.

+ // * NaN and Infinity have an exponent of 0x7ff, so the test below will also

+ // detect them.

+ __ bind(&check_special_cases);

+ scratch2 = curr_fpscr;

+ curr_fpscr = no_reg;

+ // Get exponent alone in scratch2.

+ __ vmov(scratch2, double_input.high());

+ __ Ubfx(scratch2,

+ scratch2,

+ HeapNumber::kExponentShift,

+ HeapNumber::kExponentBits);

+ const int32_t big_exp = 84;

+ __ cmp(scratch2, Operand(HeapNumber::kExponentBias + big_exp));

+ __ mov(result, Operand(0));

+ __ b(ge, done);

+ // We could not handle the truncation manually.

+ // Restore the FPSCR and fall through.

+ __ vmsr(prev_fpscr);

void LCodeGen::DoMathLog(LUnaryMathOperation* instr) {

ASSERT(ToDoubleRegister(instr->result()).is(d2));

TranscendentalCacheStub stub(TranscendentalCache::LOG,

@@ -3281,20 +3463,33 @@

void LCodeGen::DoDeferredTaggedToI(LTaggedToI* instr) {

- Label done;

- Register scratch = scratch0();

- DoubleRegister dbl_scratch = d0;

- SwVfpRegister flt_scratch = s0;

- DoubleRegister dbl_tmp = ToDoubleRegister(instr->TempAt(0));

+ Register scratch1 = scratch0();

+ Register scratch2 = ToRegister(instr->TempAt(0));

+ SwVfpRegister single_scratch = s0;

+ DwVfpRegister double_scratch1 = d0;

+ DwVfpRegister double_scratch2 = ToDoubleRegister(instr->TempAt(1));

+ DwVfpRegister double_scratch3 = ToDoubleRegister(instr->TempAt(2));

+ ASSERT(!scratch1.is(input_reg));

+ ASSERT(!scratch2.is(input_reg));

+ ASSERT(!scratch2.is(scratch1));

+ ASSERT(!double_scratch1.is(double_scratch2));

+ ASSERT(!double_scratch1.is(double_scratch3));

+ ASSERT(!double_scratch2.is(double_scratch3));

+ Label done;

// Heap number map check.

- __ ldr(scratch, FieldMemOperand(input_reg, HeapObject::kMapOffset));

+ __ ldr(scratch1, FieldMemOperand(input_reg, HeapObject::kMapOffset));

__ LoadRoot(ip, Heap::kHeapNumberMapRootIndex);

- __ cmp(scratch, Operand(ip));

+ __ cmp(scratch1, Operand(ip));

+ CpuFeatures::Scope scope(VFP3);

if (instr->truncating()) {

- Label heap_number;

+ // Performs a truncating conversion of a floating point number as used by

+ // the JS bitwise operations.

+ Label heap_number, success;

__ b(eq, &heap_number);

// Check for undefined. Undefined is converted to zero for truncating

// conversions.

@@ -3305,36 +3500,42 @@

__ b(&done);

__ bind(&heap_number);

- __ sub(ip, input_reg, Operand(kHeapObjectTag));

- __ vldr(dbl_tmp, ip, HeapNumber::kValueOffset);

- __ vcmp(dbl_tmp, 0.0); // Sets overflow bit in FPSCR flags if NaN.

- __ vcvt_s32_f64(flt_scratch, dbl_tmp);

- __ vmov(input_reg, flt_scratch); // 32-bit result of conversion.

- __ vmrs(pc); // Move vector status bits to normal status bits.

- // Overflow bit is set if dbl_tmp is Nan.

- __ cmn(input_reg, Operand(1), vc); // 0x7fffffff + 1 -> overflow.

- __ cmp(input_reg, Operand(1), vc); // 0x80000000 - 1 -> overflow.

- DeoptimizeIf(vs, instr->environment()); // Saturation may have occured.

+ DwVfpRegister double_value = double_scratch3;

+ double_scratch3 = no_dreg;

+ __ sub(scratch1, input_reg, Operand(kHeapObjectTag));

+ __ vldr(double_value, scratch1, HeapNumber::kValueOffset);

+ EmitECMATruncate(input_reg,

+ scratch1,

+ scratch2,

+ double_value,

+ double_scratch1,

+ double_scratch2,

+ &success);

+ DeoptimizeIf(al, instr->environment());

+ __ bind(&success);

} else {

// Deoptimize if we don't have a heap number.

DeoptimizeIf(ne, instr->environment());

__ sub(ip, input_reg, Operand(kHeapObjectTag));

- __ vldr(dbl_tmp, ip, HeapNumber::kValueOffset);

- __ vcvt_s32_f64(flt_scratch, dbl_tmp);

- __ vmov(input_reg, flt_scratch); // 32-bit result of conversion.

- // Non-truncating conversion means that we cannot lose bits, so we convert

- // back to check; note that using non-overlapping s and d regs would be

- // slightly faster.

- __ vcvt_f64_s32(dbl_scratch, flt_scratch);

- __ VFPCompareAndSetFlags(dbl_scratch, dbl_tmp);

- DeoptimizeIf(ne, instr->environment()); // Not equal or unordered.

+ __ vldr(double_scratch1, ip, HeapNumber::kValueOffset);

+ __ EmitVFPTruncate(kRoundToZero,

+ single_scratch,

+ double_scratch1,

+ scratch1,

+ scratch2,

+ kCheckForInexactConversion);

+ DeoptimizeIf(ne, instr->environment());

+ // Load the result.

+ __ vmov(input_reg, single_scratch);

if (instr->hydrogen()->CheckFlag(HValue::kBailoutOnMinusZero)) {

- __ tst(input_reg, Operand(input_reg));

+ __ cmp(input_reg, Operand(0));

__ b(ne, &done);

- __ vmov(lr, ip, dbl_tmp);

- __ tst(ip, Operand(1 << 31)); // Test sign bit.

+ __ vmov(scratch1, double_scratch1.high());

+ __ tst(scratch1, Operand(HeapNumber::kSignMask));

DeoptimizeIf(ne, instr->environment());

}

@@ -3377,47 +3578,46 @@

void LCodeGen::DoDoubleToI(LDoubleToI* instr) {

LOperand* input = instr->InputAt(0);

+ LOperand* result = instr->result();

ASSERT(input->IsDoubleRegister());

- LOperand* result = instr->result();

ASSERT(result->IsRegister());

- DoubleRegister double_input = ToDoubleRegister(input);

- SwVfpRegister single_scratch = double_scratch0().low();

+ DwVfpRegister double_input = ToDoubleRegister(input);

+ DwVfpRegister double_scratch1 = double_scratch0();

+ DwVfpRegister double_scratch2 = ToDoubleRegister(instr->TempAt(1));

+ SwVfpRegister single_scratch = double_scratch0().low();

- __ EmitVFPTruncate(kRoundToZero,

- single_scratch,

+ Label done;

+ if (instr->truncating()) {

+ Label success;

+ EmitECMATruncate(result_reg,

+ scratch1,

+ scratch2,

double_input,

- scratch1,

- scratch2);

- // Deoptimize if we had a vfp invalid exception.

- DeoptimizeIf(ne, instr->environment());

- // Retrieve the result.

- __ vmov(result_reg, single_scratch);

- if (!instr->truncating()) {

- // Convert result back to double and compare with input

- // to check if the conversion was exact.

- __ vmov(single_scratch, result_reg);

- __ vcvt_f64_s32(double_scratch0(), single_scratch);

- __ VFPCompareAndSetFlags(double_scratch0(), double_input);

+ double_scratch1,

+ double_scratch2,

+ &success);

+ DeoptimizeIf(al, instr->environment());

+ __ bind(&success);

+ } else {

+ VFPRoundingMode rounding_mode = kRoundToMinusInf;

+ __ EmitVFPTruncate(rounding_mode,

+ single_scratch,

+ double_input,

+ scratch1,

+ scratch2,

+ kCheckForInexactConversion);

+ // Deoptimize if we had a vfp invalid exception,

+ // including inexact operation.

DeoptimizeIf(ne, instr->environment());

- if (instr->hydrogen()->CheckFlag(HValue::kBailoutOnMinusZero)) {

- Label done;

- __ cmp(result_reg, Operand(0));

- __ b(ne, &done);

- // Check for -0.

- __ vmov(scratch1, double_input.high());

- __ tst(scratch1, Operand(HeapNumber::kSignMask));

- DeoptimizeIf(ne, instr->environment());

- __ bind(&done);

- }

+ // Retrieve the result.

+ __ vmov(result_reg, single_scratch);

}

+ __ bind(&done);

}

« src/arm/lithium-codegen-arm.h ('K') | « src/arm/lithium-codegen-arm.h ('k') | src/arm/simulator-arm.cc » ('j') | src/arm/simulator-arm.cc » ('J')