# Chromium Code Reviews

## Issue 652041: IA32: Native access to TranscendentalCache for sin/cos. (Closed)

  | OLD | NEW | 
|---|---|
| 1 // Copyright 2010 the V8 project authors. All rights reserved. | 1 // Copyright 2010 the V8 project authors. All rights reserved. | 
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without | 
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are | 
| 4 // met: | 4 // met: | 
| 5 // | 5 // | 
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright | 
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. | 
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above | 
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following | 
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided | 
| (...skipping 5807 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5818 ASSERT_EQ(args->length(), 1); | 5818 ASSERT_EQ(args->length(), 1); | 
| 5819 | 5819 | 
| 5820 // Load the argument on the stack and call the stub. | 5820 // Load the argument on the stack and call the stub. | 
| 5821 Load(args->at(0)); | 5821 Load(args->at(0)); | 
| 5822 NumberToStringStub stub; | 5822 NumberToStringStub stub; | 
| 5823 Result result = frame_->CallStub(&stub, 1); | 5823 Result result = frame_->CallStub(&stub, 1); | 
| 5824 frame_->Push(&result); | 5824 frame_->Push(&result); | 
| 5825 } | 5825 } | 
| 5826 | 5826 | 
| 5827 | 5827 | 
| 5828 void CodeGenerator::GenerateMathSin(ZoneList<Expression*>* args) { | |
| 5829 ASSERT_EQ(args->length(), 1); | |
| 5830 Load(args->at(0)); | |
| 5831 TranscendentalCacheStub stub(TranscendentalCache::SIN); | |
| 5832 Result result = frame_->CallStub(&stub, 1); | |
| 5833 frame_->Push(&result); | |
| 5834 } | |
| 5835 | |
| 5836 | |
| 5837 void CodeGenerator::GenerateMathCos(ZoneList<Expression*>* args) { | |
| 5838 ASSERT_EQ(args->length(), 1); | |
| 5839 Load(args->at(0)); | |
| 5840 TranscendentalCacheStub stub(TranscendentalCache::COS); | |
| 5841 Result result = frame_->CallStub(&stub, 1); | |
| 5842 frame_->Push(&result); | |
| 5843 } | |
| 5844 | |
| 5845 | |
| 5828 void CodeGenerator::VisitCallRuntime(CallRuntime* node) { | 5846 void CodeGenerator::VisitCallRuntime(CallRuntime* node) { | 
| 5829 if (CheckForInlineRuntimeCall(node)) { | 5847 if (CheckForInlineRuntimeCall(node)) { | 
| 5830 return; | 5848 return; | 
| 5831 } | 5849 } | 
| 5832 | 5850 | 
| 5833 ZoneList<Expression*>* args = node->arguments(); | 5851 ZoneList<Expression*>* args = node->arguments(); | 
| 5834 Comment cmnt(masm_, "[ CallRuntime"); | 5852 Comment cmnt(masm_, "[ CallRuntime"); | 
| 5835 Runtime::Function* function = node->function(); | 5853 Runtime::Function* function = node->function(); | 
| 5836 | 5854 | 
| 5837 if (function == NULL) { | 5855 if (function == NULL) { | 
| (...skipping 2278 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 8116 // If arguments are not passed in registers remove them from the stack before | 8134 // If arguments are not passed in registers remove them from the stack before | 
| 8117 // returning. | 8135 // returning. | 
| 8118 if (!HasArgsInRegisters()) { | 8136 if (!HasArgsInRegisters()) { | 
| 8119 __ ret(2 * kPointerSize); // Remove both operands | 8137 __ ret(2 * kPointerSize); // Remove both operands | 
| 8120 } else { | 8138 } else { | 
| 8121 __ ret(0); | 8139 __ ret(0); | 
| 8122 } | 8140 } | 
| 8123 } | 8141 } | 
| 8124 | 8142 | 
| 8125 | 8143 | 
| 8144 void TranscendentalCacheStub::Generate(MacroAssembler* masm) { | |
| 8145 // Input on stack: | |
| 8146 // esp[4]: argument (should be number). | |
| 8147 // esp[0]: return address. | |
| 8148 // Load the argument into eax and test that it is a number. | |
| 8149 Label runtime_call; | |
| 8150 Label runtime_call_clear_stack; | |
| 8151 Label input_not_smi; | |
| 8152 Label loaded; | |
| 8153 __ mov(eax, Operand(esp, kPointerSize)); | |
| 8154 __ test(eax, Immediate(kSmiTagMask)); | |
| 8155 __ j(not_zero, &input_not_smi); | |
| 8156 // Input is a smi. Untag and load it onto the FPU stack. | |
| 8157 // Then load the low and high words of the double into ebx, edx. | |
| 8158 ASSERT_EQ(1, kSmiTagSize); | |
| 8159 __ sar(eax, 1); | |
| 8160 __ sub(Operand(esp), Immediate(2 * kPointerSize)); | |
| 8161 __ mov(Operand(esp, 0), eax); | |
| 8162 __ fild_s(Operand(esp, 0)); | |
| 8163 __ fst_d(Operand(esp, 0)); | |
| 8164 __ pop(edx); | |
| 8165 __ pop(ebx); | |
| 8166 __ jmp(&loaded); | |
| 8167 __ bind(&input_not_smi); | |
| 8168 // Check if input is a HeapNumber. | |
| 8169 __ mov(ebx, FieldOperand(eax, HeapObject::kMapOffset)); | |
| 8170 __ cmp(Operand(ebx), Immediate(Factory::heap_number_map())); | |
| 8171 __ j(not_equal, &runtime_call); | |
| 8172 // Input is a HeapNumber. Push it on the FPU stack and load its | |
| 8173 // low and high words into ebx, edx. | |
| 8174 __ fld_d(FieldOperand(eax, HeapNumber::kValueOffset)); | |
| 
fschneider
2010/02/22 17:42:54
Just an idea: Could you optimize the FPU push/pop
 
Lasse Reichstein
2010/02/23 10:18:53
Probably. I would need a flag, or two different pa
 | |
| 8175 __ mov(edx, FieldOperand(eax, HeapNumber::kExponentOffset)); | |
| 8176 __ mov(ebx, FieldOperand(eax, HeapNumber::kMantissaOffset)); | |
| 8177 | |
| 8178 __ bind(&loaded); | |
| 8179 // ST[0] == double value | |
| 8180 // ebx = low 32 bits of double value | |
| 8181 // edx = high 32 bits of double value | |
| 8182 // Compute hash: | |
| 8183 // h = (low ^ high); h ^= h >> 16; h ^= h >> 8; h = h & (cacheSize - 1); | |
| 8184 __ mov(ecx, ebx); | |
| 8185 __ xor_(ecx, Operand(edx)); | |
| 8186 __ mov(eax, ecx); | |
| 8187 __ sar(eax, 16); | |
| 8188 __ xor_(ecx, Operand(eax)); | |
| 8189 __ mov(eax, ecx); | |
| 8190 __ sar(eax, 8); | |
| 8191 __ xor_(ecx, Operand(eax)); | |
| 8192 __ and_(Operand(ecx), Immediate(TranscendentalCache::kCacheSize - 1)); | |
| 
fschneider
2010/02/22 17:42:54
This assumes that kCacheSize is a power of two. I'
 
Lasse Reichstein
2010/02/23 10:18:53
Well spotted. I moved this line up here but forgot
 | |
| 8193 // ST[0] == double value. | |
| 8194 // ebx = low 32 bits of double value. | |
| 8195 // edx = high 32 bits of double value. | |
| 8196 // ecx = TranscendentalCache::hash(double value). | |
| 8197 ASSERT(IsPowerOf2(TranscendentalCache::kCacheSize)); | |
| 
fschneider
2010/02/22 17:42:54
Move this ASSERT to above.
 | |
| 8198 __ mov(eax, | |
| 8199 Immediate(ExternalReference::transcendental_cache_array_address())); | |
| 8200 // Eax points to cache array. | |
| 8201 __ mov(eax, Operand(eax, type_ * sizeof(TranscendentalCache::caches_[0]))); | |
| 8202 // Eax points to the cache for the type type_. | |
| 8203 // If NULL, the cache hasn't been initialized yet, so go through runtime. | |
| 8204 __ test(eax, Operand(eax)); | |
| 8205 __ j(zero, &runtime_call_clear_stack); | |
| 8206 #ifdef DEBUG | |
| 8207 // Check that the layout of cache elements match expectations. | |
| 8208 { // NOLINT - doesn't like a single brace on a line. | |
| 8209 TranscendentalCache::Element test_elem[2]; | |
| 8210 char* elem_start = reinterpret_cast<char*>(&test_elem[0]); | |
| 8211 char* elem2_start = reinterpret_cast<char*>(&test_elem[1]); | |
| 8212 char* elem_in0 = reinterpret_cast<char*>(&(test_elem[0].in[0])); | |
| 8213 char* elem_in1 = reinterpret_cast<char*>(&(test_elem[0].in[1])); | |
| 8214 char* elem_out = reinterpret_cast<char*>(&(test_elem[0].output)); | |
| 8215 CHECK_EQ(12, elem2_start - elem_start); // Two uint32_t's and a pointer. | |
| 8216 CHECK_EQ(0, elem_in0 - elem_start); | |
| 8217 CHECK_EQ(kIntSize, elem_in1 - elem_start); | |
| 8218 CHECK_EQ(2 * kIntSize, elem_out - elem_start); | |
| 8219 } | |
| 8220 #endif | |
| 8221 // Find the address of the ecx'th entry in the cache, i.e., &eax[ecx*12]. | |
| 8222 __ lea(ecx, Operand(ecx, ecx, times_2, 0)); | |
| 8223 __ lea(ecx, Operand(eax, ecx, times_4, 0)); | |
| 8224 // Check if cache matches: Double value is stored in uint32_t[2] array. | |
| 8225 Label cache_miss; | |
| 8226 __ cmp(ebx, Operand(ecx, 0)); | |
| 8227 __ j(not_equal, &cache_miss); | |
| 8228 __ cmp(edx, Operand(ecx, kIntSize)); // NOLINT | |
| 
fschneider
2010/02/22 17:42:54
Isn't this always half the size of a double (32 bi
 
Lasse Reichstein
2010/02/23 10:18:53
It should be. The cache element holds two integers
 | |
| 8229 __ j(not_equal, &cache_miss); | |
| 8230 // Cache hit! | |
| 8231 __ mov(eax, Operand(ecx, 2 * kIntSize)); // NOLINT | |
| 8232 __ fstp(0); | |
| 
fschneider
2010/02/22 17:42:54
Could this pop() of the FPU stack go away? (see my
 
Lasse Reichstein
2010/02/23 10:18:53
I don't think it's worth it.
In the smi case, I ne
 | |
| 8233 __ ret(kPointerSize); | |
| 8234 | |
| 8235 __ bind(&cache_miss); | |
| 8236 // Update cache with new value. | |
| 8237 // We are short on registers, so use no_reg as scratch. | |
| 8238 // This gives slightly larger code. | |
| 8239 __ AllocateHeapNumber(eax, edi, no_reg, &runtime_call_clear_stack); | |
| 8240 GenerateOperation(masm); | |
| 8241 __ mov(Operand(ecx, 0), ebx); | |
| 8242 __ mov(Operand(ecx, sizeof(uint32_t)), edx); // NOLINT | |
| 8243 __ mov(Operand(ecx, sizeof(uint32_t[2])), eax); // NOLINT | |
| 8244 __ fstp_d(FieldOperand(eax, HeapNumber::kValueOffset)); | |
| 8245 __ ret(kPointerSize); | |
| 8246 | |
| 8247 __ bind(&runtime_call_clear_stack); | |
| 8248 __ fstp(0); | |
| 8249 __ bind(&runtime_call); | |
| 8250 __ TailCallRuntime(ExternalReference(RuntimeFunction()), 1, 1); | |
| 8251 } | |
| 8252 | |
| 8253 | |
| 8254 Runtime::FunctionId TranscendentalCacheStub::RuntimeFunction() { | |
| 8255 switch (type_) { | |
| 8256 // Add more cases when necessary. | |
| 8257 case TranscendentalCache::SIN: return Runtime::kMath_sin; | |
| 8258 case TranscendentalCache::COS: return Runtime::kMath_cos; | |
| 8259 default: | |
| 8260 UNIMPLEMENTED(); | |
| 8261 return Runtime::kAbort; | |
| 8262 } | |
| 8263 } | |
| 8264 | |
| 8265 | |
| 8266 void TranscendentalCacheStub::GenerateOperation(MacroAssembler* masm) { | |
| 8267 // Only free register is edi. | |
| 8268 Label done; | |
| 8269 switch (type_) { | |
| 8270 case TranscendentalCache::SIN: | |
| 8271 case TranscendentalCache::COS: { | |
| 
fschneider
2010/02/22 17:42:54
Are there potentially more types of transcendental
 
Lasse Reichstein
2010/02/23 10:18:53
There are potentially more, some of which won't ne
 | |
| 8272 // Both fsin and fcos require arguments in the range +/-2^63 and | |
| 8273 // return NaN for infinities and NaN. They can share all code except | |
| 8274 // the actual fsin/fcos operation. | |
| 8275 Label in_range; | |
| 8276 // If argument is outside the range -2^63..2^63, fsin/cos doesn't | |
| 8277 // work. We must reduce it to the appropriate range. | |
| 8278 __ mov(edi, edx); | |
| 8279 __ and_(Operand(edi), Immediate(0x7ff00000)); // Exponent only. | |
| 8280 int supported_exponent_limit = | |
| 8281 (63 + HeapNumber::kExponentBias) << HeapNumber::kExponentShift; | |
| 8282 __ cmp(Operand(edi), Immediate(supported_exponent_limit)); | |
| 8283 __ j(below, &in_range, taken); | |
| 8284 // Check for infinity and NaN. Both return NaN for sin. | |
| 8285 __ cmp(Operand(edi), Immediate(0x7ff00000)); | |
| 8286 Label non_nan_result; | |
| 8287 __ j(not_equal, &non_nan_result, taken); | |
| 8288 // Input is +/-Infinity or NaN. Result is NaN. | |
| 8289 __ fstp(0); | |
| 8290 // NaN is represented by 0x7ff8000000000000. | |
| 8291 __ push(Immediate(0x7ff80000)); | |
| 8292 __ push(Immediate(0)); | |
| 8293 __ fld_d(Operand(esp, 0)); | |
| 8294 __ add(Operand(esp), Immediate(2 * kPointerSize)); | |
| 8295 __ jmp(&done); | |
| 8296 | |
| 8297 __ bind(&non_nan_result); | |
| 8298 | |
| 8299 // Use fprem to restrict argument to the range +/-2*PI. | |
| 8300 __ mov(edi, eax); // Save eax before using fnstsw_ax. | |
| 8301 __ fldpi(); | |
| 8302 __ fadd(0); | |
| 8303 __ fld(1); | |
| 8304 // FPU Stack: input, 2*pi, input. | |
| 8305 { | |
| 8306 Label no_exceptions; | |
| 8307 __ fwait(); | |
| 8308 __ fnstsw_ax(); | |
| 8309 // Clear if Invalid Operation or Zero Divide exceptions are set. | |
| 8310 __ test(Operand(eax), Immediate(5)); | |
| 8311 __ j(zero, &no_exceptions); | |
| 8312 __ fnclex(); | |
| 8313 __ bind(&no_exceptions); | |
| 8314 } | |
| 8315 | |
| 8316 // Compute st(0) % st(1) | |
| 8317 { | |
| 8318 Label partial_remainder_loop; | |
| 8319 __ bind(&partial_remainder_loop); | |
| 8320 __ fprem(); | |
| 
fschneider
2010/02/22 17:42:54
Is there a reason for not using fprem1()?
 
Lasse Reichstein
2010/02/23 10:18:53
It is slightly slower on some chips, but it also g
 | |
| 8321 __ fwait(); | |
| 8322 __ fnstsw_ax(); | |
| 8323 __ test(Operand(eax), Immediate(0x400 /* C2 */)); | |
| 8324 // If C2 is set, computation only has partial result. Loop to | |
| 8325 // continue computation. | |
| 8326 __ j(not_zero, &partial_remainder_loop); | |
| 8327 } | |
| 8328 // FPU Stack: input, 2*pi, input % 2*pi | |
| 8329 __ fstp(2); | |
| 8330 __ fstp(0); | |
| 8331 __ mov(eax, edi); // Restore eax (allocated HeapNumber pointer). | |
| 8332 | |
| 8333 // FPU Stack: input % 2*pi | |
| 8334 __ bind(&in_range); | |
| 8335 switch (type_) { | |
| 8336 case TranscendentalCache::SIN: | |
| 8337 __ fsin(); | |
| 8338 break; | |
| 8339 case TranscendentalCache::COS: | |
| 8340 __ fcos(); | |
| 8341 break; | |
| 8342 default: | |
| 8343 UNREACHABLE(); | |
| 8344 } | |
| 8345 break; | |
| 8346 } | |
| 8347 default: | |
| 8348 UNIMPLEMENTED(); | |
| 8349 } | |
| 8350 __ bind(&done); | |
| 8351 } | |
| 8352 | |
| 8353 | |
| 8126 // Get the integer part of a heap number. Surprisingly, all this bit twiddling | 8354 // Get the integer part of a heap number. Surprisingly, all this bit twiddling | 
| 8127 // is faster than using the built-in instructions on floating point registers. | 8355 // is faster than using the built-in instructions on floating point registers. | 
| 8128 // Trashes edi and ebx. Dest is ecx. Source cannot be ecx or one of the | 8356 // Trashes edi and ebx. Dest is ecx. Source cannot be ecx or one of the | 
| 8129 // trashed registers. | 8357 // trashed registers. | 
| 8130 void IntegerConvert(MacroAssembler* masm, | 8358 void IntegerConvert(MacroAssembler* masm, | 
| 8131 Register source, | 8359 Register source, | 
| 8132 bool use_sse3, | 8360 bool use_sse3, | 
| 8133 Label* conversion_failure) { | 8361 Label* conversion_failure) { | 
| 8134 ASSERT(!source.is(ecx) && !source.is(edi) && !source.is(ebx)); | 8362 ASSERT(!source.is(ecx) && !source.is(edi) && !source.is(ebx)); | 
| 8135 Label done, right_exponent, normal_exponent; | 8363 Label done, right_exponent, normal_exponent; | 
| (...skipping 2657 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 10793 | 11021 | 
| 10794 // Call the runtime; it returns -1 (less), 0 (equal), or 1 (greater) | 11022 // Call the runtime; it returns -1 (less), 0 (equal), or 1 (greater) | 
| 10795 // tagged as a small integer. | 11023 // tagged as a small integer. | 
| 10796 __ bind(&runtime); | 11024 __ bind(&runtime); | 
| 10797 __ TailCallRuntime(ExternalReference(Runtime::kStringCompare), 2, 1); | 11025 __ TailCallRuntime(ExternalReference(Runtime::kStringCompare), 2, 1); | 
| 10798 } | 11026 } | 
| 10799 | 11027 | 
| 10800 #undef __ | 11028 #undef __ | 
| 10801 | 11029 | 
| 10802 } } // namespace v8::internal | 11030 } } // namespace v8::internal | 
| OLD | NEW |