 Chromium Code Reviews
 Chromium Code Reviews Issue 1860001:
  X64: Port inline transcendental cache to X64.  (Closed)
    
  
    Issue 1860001:
  X64: Port inline transcendental cache to X64.  (Closed) 
  | Index: src/x64/codegen-x64.cc | 
| diff --git a/src/x64/codegen-x64.cc b/src/x64/codegen-x64.cc | 
| index 9575a294eba7d7d54a71076ac679318e65131722..5f65c7f7a273e3571a7d559d022c95fa9377fcb9 100644 | 
| --- a/src/x64/codegen-x64.cc | 
| +++ b/src/x64/codegen-x64.cc | 
| @@ -4517,19 +4517,19 @@ void CodeGenerator::GenerateCallFunction(ZoneList<Expression*>* args) { | 
| void CodeGenerator::GenerateMathSin(ZoneList<Expression*>* args) { | 
| ASSERT_EQ(args->length(), 1); | 
| - // Load the argument on the stack and jump to the runtime. | 
| Load(args->at(0)); | 
| - Result answer = frame_->CallRuntime(Runtime::kMath_sin, 1); | 
| - frame_->Push(&answer); | 
| + TranscendentalCacheStub stub(TranscendentalCache::SIN); | 
| + Result result = frame_->CallStub(&stub, 1); | 
| + frame_->Push(&result); | 
| } | 
| void CodeGenerator::GenerateMathCos(ZoneList<Expression*>* args) { | 
| ASSERT_EQ(args->length(), 1); | 
| - // Load the argument on the stack and jump to the runtime. | 
| Load(args->at(0)); | 
| - Result answer = frame_->CallRuntime(Runtime::kMath_cos, 1); | 
| - frame_->Push(&answer); | 
| + TranscendentalCacheStub stub(TranscendentalCache::COS); | 
| + Result result = frame_->CallStub(&stub, 1); | 
| + frame_->Push(&result); | 
| } | 
| @@ -7397,6 +7397,218 @@ bool CodeGenerator::FoldConstantSmis(Token::Value op, int left, int right) { | 
| // End of CodeGenerator implementation. | 
| +void TranscendentalCacheStub::Generate(MacroAssembler* masm) { | 
| + // Input on stack: | 
| + // rsp[8]: argument (should be number). | 
| + // rsp[0]: return address. | 
| + // Test that eax is a number. | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
eax -> rax
Move to after the label declarations?
 
Lasse Reichstein
2010/05/03 10:33:44
Done.
 | 
| + Label runtime_call; | 
| + Label runtime_call_clear_stack; | 
| + Label input_not_smi; | 
| + Label loaded; | 
| + __ movq(rax, Operand(rsp, kPointerSize)); | 
| + __ JumpIfNotSmi(rax, &input_not_smi); | 
| + // Input is a smi. Untag and load it onto the FPU stack. | 
| + // Then load the bits of the double into rbx. | 
| + ASSERT_EQ(1, kSmiTagSize); | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
Does the assert add anything when you are using th
 
Lasse Reichstein
2010/05/03 10:33:44
Removed.
 | 
| + __ SmiToInteger32(rax, rax); | 
| + __ subq(rsp, Immediate(kPointerSize)); | 
| + __ cvtlsi2sd(xmm1, rax); | 
| + __ movsd(Operand(rsp, 0), xmm1); | 
| + __ movq(rbx, xmm1); | 
| + __ movq(rdx, xmm1); | 
| + __ fld_d(Operand(rsp, 0)); | 
| + __ addq(rsp, Immediate(kPointerSize)); | 
| + __ jmp(&loaded); | 
| + | 
| + __ bind(&input_not_smi); | 
| + // Check if input is a HeapNumber. | 
| + __ Move(rbx, Factory::heap_number_map()); | 
| + __ cmpq(rbx, FieldOperand(rax, HeapObject::kMapOffset)); | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
Can we use CompareRoot here?
 
Lasse Reichstein
2010/05/03 10:33:44
We can, but in this case I don't want to. Using th
 | 
| + __ j(not_equal, &runtime_call); | 
| + // Input is a HeapNumber. Push it on the FPU stack and load its | 
| + // bits into rbx. | 
| + __ fld_d(FieldOperand(rax, HeapNumber::kValueOffset)); | 
| + __ movq(rbx, FieldOperand(rax, HeapNumber::kValueOffset)); | 
| + __ movq(rdx, rbx); | 
| + __ bind(&loaded); | 
| + // ST[0] == double value | 
| + // rbx = bits of double value. | 
| + // rdx = also bits of double value. | 
| + // Compute hash (h is 32 bits, bits are 64): | 
| + // h = h0 = bits ^ (bits >> 32); | 
| + // h ^= h >> 16; | 
| + // h ^= h >> 8; | 
| + // h = h & (cacheSize - 1); | 
| + // or h = (h0 ^ (h0 >> 8) ^ (h0 >> 16) ^ (h0 >> 24)) & cacheSize - 1 | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
add parenthesis (cacheSize - 1)?
 
Lasse Reichstein
2010/05/03 10:33:44
Done.
 | 
| + __ sar(rdx, Immediate(32)); | 
| + __ xorl(rdx, rbx); | 
| + __ movl(rcx, rdx); | 
| + __ movl(rax, rdx); | 
| + __ movl(rdi, rdx); | 
| + __ sarl(rdx, Immediate(8)); | 
| + __ sarl(rcx, Immediate(16)); | 
| + __ sarl(rax, Immediate(24)); | 
| + __ xorl(rcx, rdx); | 
| + __ xorl(rax, rdi); | 
| + __ xorl(rcx, rax); | 
| + ASSERT(IsPowerOf2(TranscendentalCache::kCacheSize)); | 
| + __ andl(rcx, Immediate(TranscendentalCache::kCacheSize - 1)); | 
| + // ST[0] == double value. | 
| + // rbx = bits of double value. | 
| + // rcx = TranscendentalCache::hash(double value). | 
| + __ movq(rax, ExternalReference::transcendental_cache_array_address()); | 
| + // rax points to cache array. | 
| + __ movq(rax, Operand(rax, type_ * sizeof(TranscendentalCache::caches_[0]))); | 
| + // rax points to the cache for the type type_. | 
| + // If NULL, the cache hasn't been initialized yet, so go through runtime. | 
| + __ testq(rax, rax); | 
| + __ j(zero, &runtime_call_clear_stack); | 
| +#ifdef DEBUG | 
| + // Check that the layout of cache elements match expectations. | 
| + { // NOLINT - doesn't like a single brace on a line. | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
Will the linter be happy with:
  {  Transcendenta
 
Lasse Reichstein
2010/05/03 10:33:44
It probably will, but I'd hate it instead. I'd rat
 | 
| + TranscendentalCache::Element test_elem[2]; | 
| + char* elem_start = reinterpret_cast<char*>(&test_elem[0]); | 
| + char* elem2_start = reinterpret_cast<char*>(&test_elem[1]); | 
| + char* elem_in0 = reinterpret_cast<char*>(&(test_elem[0].in[0])); | 
| + char* elem_in1 = reinterpret_cast<char*>(&(test_elem[0].in[1])); | 
| + char* elem_out = reinterpret_cast<char*>(&(test_elem[0].output)); | 
| + // Two uint_32's and a pointer per element. | 
| + CHECK_EQ(16, static_cast<int>(elem2_start - elem_start)); | 
| + CHECK_EQ(0, static_cast<int>(elem_in0 - elem_start)); | 
| + CHECK_EQ(kIntSize, static_cast<int>(elem_in1 - elem_start)); | 
| + CHECK_EQ(2 * kIntSize, static_cast<int>(elem_out - elem_start)); | 
| + } | 
| +#endif | 
| + // Find the address of the rcx'th entry in the cache, i.e., &rax[rcx*16]. | 
| + __ addl(rcx, rcx); | 
| + __ lea(rcx, Operand(rax, rcx, times_8, 0));\ | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
Remove '\' at end of line.
 
Lasse Reichstein
2010/05/03 10:33:44
Whoops.
 | 
| + // Check if cache matches: Double value is stored in uint32_t[2] array. | 
| + Label cache_miss; | 
| + __ cmpq(rbx, Operand(rcx, 0)); | 
| + __ j(not_equal, &cache_miss); | 
| + // Cache hit! | 
| + __ movq(rax, Operand(rcx, 2 * kIntSize)); | 
| + __ fstp(0); // Clear FPU stack. | 
| + __ ret(kPointerSize); | 
| + | 
| + __ bind(&cache_miss); | 
| + // Update cache with new value. | 
| + // We are short on registers, so use no_reg as scratch. | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
This is a left over comment from ia32, remove?
 
Lasse Reichstein
2010/05/03 10:33:44
Removed.
 | 
| + // This gives slightly larger code. | 
| + __ AllocateHeapNumber(rax, rdi, &runtime_call_clear_stack); | 
| + GenerateOperation(masm); | 
| + __ movq(Operand(rcx, 0), rbx); | 
| + __ movq(Operand(rcx, 2 * kIntSize), rax); | 
| + __ fstp_d(FieldOperand(rax, HeapNumber::kValueOffset)); | 
| + __ ret(kPointerSize); | 
| + | 
| + __ bind(&runtime_call_clear_stack); | 
| + __ fstp(0); | 
| + __ bind(&runtime_call); | 
| + __ TailCallExternalReference(ExternalReference(RuntimeFunction()), 1, 1); | 
| +} | 
| + | 
| + | 
| +Runtime::FunctionId TranscendentalCacheStub::RuntimeFunction() { | 
| + switch (type_) { | 
| + // Add more cases when necessary. | 
| + case TranscendentalCache::SIN: return Runtime::kMath_sin; | 
| + case TranscendentalCache::COS: return Runtime::kMath_cos; | 
| + default: | 
| + UNIMPLEMENTED(); | 
| + return Runtime::kAbort; | 
| + } | 
| +} | 
| + | 
| + | 
| +void TranscendentalCacheStub::GenerateOperation(MacroAssembler* masm) { | 
| + // Only free register is edi. | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
rdi
 
Lasse Reichstein
2010/05/03 10:33:44
fixed.
 | 
| + Label done; | 
| + ASSERT(type_ == TranscendentalCache::SIN || | 
| + type_ == TranscendentalCache::COS); | 
| + // More transcendental types can be added later. | 
| + | 
| + // Both fsin and fcos require arguments in the range +/-2^63 and | 
| + // return NaN for infinities and NaN. They can share all code except | 
| + // the actual fsin/fcos operation. | 
| + Label in_range; | 
| + // If argument is outside the range -2^63..2^63, fsin/cos doesn't | 
| + // work. We must reduce it to the appropriate range. | 
| + __ movq(rdi, rbx); | 
| + __ shr(rdi, Immediate(52)); // Exponent | 
| + __ andl(rdi, Immediate(0x7ff)); | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
Should we use a couple of named constants for this
 
Lasse Reichstein
2010/05/03 10:33:44
Fixed.
 | 
| + int supported_exponent_limit = | 
| + (63 + HeapNumber::kExponentBias); | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
This will fit on line above?
 
Lasse Reichstein
2010/05/03 10:33:44
Done.
 | 
| + __ cmpl(rdi, Immediate(supported_exponent_limit)); | 
| + __ j(below, &in_range); | 
| + // Check for infinity and NaN. Both return NaN for sin. | 
| + __ cmpl(rdi, Immediate(0x7ff)); | 
| + Label non_nan_result; | 
| + __ j(not_equal, &non_nan_result); | 
| + // Input is +/-Infinity or NaN. Result is NaN. | 
| + __ fstp(0); // Clear fpu stack. | 
| + // NaN is represented by 0x7ff8000000000000. | 
| + __ movq(rdi, static_cast<uint64_t>(0x7ff8)<<48, RelocInfo::NONE); | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
We should use a named constant for the nan represe
 
Lasse Reichstein
2010/05/03 10:33:44
Done.
 | 
| + __ push(rdi); | 
| + __ fld_d(Operand(rsp, 0)); | 
| + __ addq(rsp, Immediate(kPointerSize)); | 
| + __ jmp(&done); | 
| + | 
| + __ bind(&non_nan_result); | 
| + | 
| + // Use fpmod to restrict argument to the range +/-2*PI. | 
| + __ movq(rdi, rax); // Save rax before using fnstsw_ax. | 
| + __ fldpi(); | 
| + __ fadd(0); | 
| + __ fld(1); | 
| + // FPU Stack: input, 2*pi, input. | 
| + { | 
| + Label no_exceptions; | 
| + __ fwait(); | 
| + __ fnstsw_ax(); | 
| + // Clear if Illegal Operand or Zero Division exceptions are set. | 
| + __ testl(rax, Immediate(5)); | 
| + __ j(zero, &no_exceptions); | 
| + __ fnclex(); | 
| + __ bind(&no_exceptions); | 
| + } | 
| + | 
| + // Compute st(0) % st(1) | 
| + { | 
| + Label partial_remainder_loop; | 
| + __ bind(&partial_remainder_loop); | 
| + __ fprem1(); | 
| + __ fwait(); | 
| + __ fnstsw_ax(); | 
| + __ testl(rax, Immediate(0x400 /* C2 */)); | 
| 
Mads Ager (chromium)
2010/05/03 09:14:46
We usually do not put comment in here. This seems
 
Lasse Reichstein
2010/05/03 10:33:44
C2 moved to comment.
 | 
| + // If C2 is set, computation only has partial result. Loop to | 
| + // continue computation. | 
| + __ j(not_zero, &partial_remainder_loop); | 
| + } | 
| + // FPU Stack: input, 2*pi, input % 2*pi | 
| + __ fstp(2); | 
| + // FPU Stack: input % 2*pi, 2*pi, | 
| + __ fstp(0); | 
| + // FPU Stack: input % 2*pi | 
| + __ movq(rax, rdi); // Restore eax (allocated HeapNumber pointer). | 
| 
Lasse Reichstein
2010/05/03 10:33:44
eax->rax.
 | 
| + | 
| + // FPU Stack: input % 2*pi | 
| + __ bind(&in_range); | 
| + switch (type_) { | 
| + case TranscendentalCache::SIN: | 
| + __ fsin(); | 
| + break; | 
| + case TranscendentalCache::COS: | 
| + __ fcos(); | 
| + break; | 
| + default: | 
| + UNREACHABLE(); | 
| + } | 
| + __ bind(&done); | 
| +} | 
| + | 
| + | 
| // Get the integer part of a heap number. Surprisingly, all this bit twiddling | 
| // is faster than using the built-in instructions on floating point registers. | 
| // Trashes rdi and rbx. Dest is rcx. Source cannot be rcx or one of the |