X64: Port inline transcendental cache to X64.

Review URL: http://codereview.chromium.org/1860001 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4567 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2010-05-03 10:43:49 +00:00 · 2010-05-03 10:43:49 +00:00 · 1790c3534a
commit 1790c3534a
parent 9d5f6d0b46
7 changed files with 354 additions and 23 deletions
--- a/src/objects.h
+++ b/src/objects.h
@ -1116,6 +1116,8 @@ class HeapNumber: public HeapObject {
  static const uint32_t kSignMask = 0x80000000u;
  static const uint32_t kExponentMask = 0x7ff00000u;
  static const uint32_t kMantissaMask = 0xfffffu;
+  static const int kMantissaBits = 52;
+  static const int KExponentBits = 11;
  static const int kExponentBias = 1023;
  static const int kExponentShift = 20;
  static const int kMantissaBitsInTopWord = 20;
--- a/src/x64/assembler-x64-inl.h
+++ b/src/x64/assembler-x64-inl.h
@ -89,6 +89,11 @@ void Assembler::emit_rex_64(XMMRegister reg, Register rm_reg) {
 }


+void Assembler::emit_rex_64(Register reg, XMMRegister rm_reg) {
+  emit(0x48 | (reg.code() & 0x8) >> 1 | rm_reg.code() >> 3);
+}
+
+
 void Assembler::emit_rex_64(Register reg, const Operand& op) {
  emit(0x48 | reg.high_bit() << 2 | op.rex_);
 }
@ -160,6 +165,12 @@ void Assembler::emit_optional_rex_32(XMMRegister reg, Register base) {
 }


+void Assembler::emit_optional_rex_32(Register reg, XMMRegister base) {
+  byte rex_bits =  (reg.code() & 0x8) >> 1 | (base.code() & 0x8) >> 3;
+  if (rex_bits != 0) emit(0x40 | rex_bits);
+}
+
+
 void Assembler::emit_optional_rex_32(Register rm_reg) {
  if (rm_reg.high_bit()) emit(0x41);
 }
--- a/src/x64/assembler-x64.cc
+++ b/src/x64/assembler-x64.cc
@ -2042,6 +2042,14 @@ void Assembler::fldz() {
 }


+void Assembler::fldpi() {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0xD9);
+  emit(0xEB);
+}
+
+
 void Assembler::fld_s(const Operand& adr) {
  EnsureSpace ensure_space(this);
  last_pc_ = pc_;
@ -2400,6 +2408,53 @@ void Assembler::movd(XMMRegister dst, Register src) {
 }


+void Assembler::movd(Register dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0x66);
+  emit_optional_rex_32(dst, src);
+  emit(0x0F);
+  emit(0x7E);
+  emit_sse_operand(dst, src);
+}
+
+
+void Assembler::movq(XMMRegister dst, Register src) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0x66);
+  emit_rex_64(dst, src);
+  emit(0x0F);
+  emit(0x6E);
+  emit_sse_operand(dst, src);
+}
+
+
+void Assembler::movq(Register dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0x66);
+  emit_rex_64(dst, src);
+  emit(0x0F);
+  emit(0x7E);
+  emit_sse_operand(dst, src);
+}
+
+
+void Assembler::extractps(Register dst, XMMRegister src, byte imm8) {
+  ASSERT(is_uint2(imm8));
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0x66);
+  emit_optional_rex_32(dst, src);
+  emit(0x0F);
+  emit(0x3A);
+  emit(0x17);
+  emit_sse_operand(dst, src);
+  emit(imm8);
+}
+
+
 void Assembler::movsd(const Operand& dst, XMMRegister src) {
  EnsureSpace ensure_space(this);
  last_pc_ = pc_;
@ -2601,6 +2656,10 @@ void Assembler::emit_sse_operand(XMMRegister dst, Register src) {
  emit(0xC0 | (dst.low_bits() << 3) | src.low_bits());
 }

+void Assembler::emit_sse_operand(Register dst, XMMRegister src) {
+  emit(0xC0 | (dst.low_bits() << 3) | src.low_bits());
+}
+

 // Relocation information implementations.

--- a/src/x64/assembler-x64.h
+++ b/src/x64/assembler-x64.h
@ -1019,6 +1019,7 @@ class Assembler : public Malloced {

  void fld1();
  void fldz();
+  void fldpi();

  void fld_s(const Operand& adr);
  void fld_d(const Operand& adr);
@ -1080,6 +1081,10 @@ class Assembler : public Malloced {

  // SSE2 instructions
  void movd(XMMRegister dst, Register src);
+  void movd(Register dst, XMMRegister src);
+  void movq(XMMRegister dst, Register src);
+  void movq(Register dst, XMMRegister src);
+  void extractps(Register dst, XMMRegister src, byte imm8);

  void movsd(const Operand& dst, XMMRegister src);
  void movsd(XMMRegister dst, XMMRegister src);
@ -1110,6 +1115,7 @@ class Assembler : public Malloced {
  void emit_sse_operand(XMMRegister dst, XMMRegister src);
  void emit_sse_operand(XMMRegister reg, const Operand& adr);
  void emit_sse_operand(XMMRegister dst, Register src);
+  void emit_sse_operand(Register dst, XMMRegister src);

  // Use either movsd or movlpd.
  // void movdbl(XMMRegister dst, const Operand& src);
@ -1176,8 +1182,9 @@ class Assembler : public Malloced {
  // the top bit of both register codes.
  // High bit of reg goes to REX.R, high bit of rm_reg goes to REX.B.
  // REX.W is set.
-  inline void emit_rex_64(Register reg, Register rm_reg);
  inline void emit_rex_64(XMMRegister reg, Register rm_reg);
+  inline void emit_rex_64(Register reg, XMMRegister rm_reg);
+  inline void emit_rex_64(Register reg, Register rm_reg);

  // Emits a REX prefix that encodes a 64-bit operand size and
  // the top bit of the destination, index, and base register codes.
@ -1235,9 +1242,13 @@ class Assembler : public Malloced {
  inline void emit_optional_rex_32(XMMRegister reg, XMMRegister base);

  // As for emit_optional_rex_32(Register, Register), except that
-  // the registers are XMM registers.
+  // one of the registers is an XMM registers.
  inline void emit_optional_rex_32(XMMRegister reg, Register base);

+  // As for emit_optional_rex_32(Register, Register), except that
+  // one of the registers is an XMM registers.
+  inline void emit_optional_rex_32(Register reg, XMMRegister base);
+
  // As for emit_optional_rex_32(Register, const Operand&), except that
  // the register is an XMM register.
  inline void emit_optional_rex_32(XMMRegister reg, const Operand& op);
--- a/src/x64/codegen-x64.cc
+++ b/src/x64/codegen-x64.cc
@ -43,6 +43,8 @@ namespace internal {

 #define __ ACCESS_MASM(masm_)

+static int64_t kNaNValue = V8_INT64_C(0x7ff8000000000000);
+
 // -------------------------------------------------------------------------
 // Platform-specific DeferredCode functions.

@ -4516,19 +4518,19 @@ void CodeGenerator::GenerateCallFunction(ZoneList<Expression*>* args) {

 void CodeGenerator::GenerateMathSin(ZoneList<Expression*>* args) {
  ASSERT_EQ(args->length(), 1);
-  // Load the argument on the stack and jump to the runtime.
  Load(args->at(0));
-  Result answer = frame_->CallRuntime(Runtime::kMath_sin, 1);
-  frame_->Push(&answer);
+  TranscendentalCacheStub stub(TranscendentalCache::SIN);
+  Result result = frame_->CallStub(&stub, 1);
+  frame_->Push(&result);
 }


 void CodeGenerator::GenerateMathCos(ZoneList<Expression*>* args) {
  ASSERT_EQ(args->length(), 1);
-  // Load the argument on the stack and jump to the runtime.
  Load(args->at(0));
-  Result answer = frame_->CallRuntime(Runtime::kMath_cos, 1);
-  frame_->Push(&answer);
+  TranscendentalCacheStub stub(TranscendentalCache::COS);
+  Result result = frame_->CallStub(&stub, 1);
+  frame_->Push(&result);
 }


@ -7459,6 +7461,216 @@ bool CodeGenerator::FoldConstantSmis(Token::Value op, int left, int right) {

 // End of CodeGenerator implementation.

+void TranscendentalCacheStub::Generate(MacroAssembler* masm) {
+  // Input on stack:
+  // rsp[8]: argument (should be number).
+  // rsp[0]: return address.
+  Label runtime_call;
+  Label runtime_call_clear_stack;
+  Label input_not_smi;
+  Label loaded;
+  // Test that rax is a number.
+  __ movq(rax, Operand(rsp, kPointerSize));
+  __ JumpIfNotSmi(rax, &input_not_smi);
+  // Input is a smi. Untag and load it onto the FPU stack.
+  // Then load the bits of the double into rbx.
+  __ SmiToInteger32(rax, rax);
+  __ subq(rsp, Immediate(kPointerSize));
+  __ cvtlsi2sd(xmm1, rax);
+  __ movsd(Operand(rsp, 0), xmm1);
+  __ movq(rbx, xmm1);
+  __ movq(rdx, xmm1);
+  __ fld_d(Operand(rsp, 0));
+  __ addq(rsp, Immediate(kPointerSize));
+  __ jmp(&loaded);
+
+  __ bind(&input_not_smi);
+  // Check if input is a HeapNumber.
+  __ Move(rbx, Factory::heap_number_map());
+  __ cmpq(rbx, FieldOperand(rax, HeapObject::kMapOffset));
+  __ j(not_equal, &runtime_call);
+  // Input is a HeapNumber. Push it on the FPU stack and load its
+  // bits into rbx.
+  __ fld_d(FieldOperand(rax, HeapNumber::kValueOffset));
+  __ movq(rbx, FieldOperand(rax, HeapNumber::kValueOffset));
+  __ movq(rdx, rbx);
+  __ bind(&loaded);
+  // ST[0] == double value
+  // rbx = bits of double value.
+  // rdx = also bits of double value.
+  // Compute hash (h is 32 bits, bits are 64):
+  //   h = h0 = bits ^ (bits >> 32);
+  //   h ^= h >> 16;
+  //   h ^= h >> 8;
+  //   h = h & (cacheSize - 1);
+  // or h = (h0 ^ (h0 >> 8) ^ (h0 >> 16) ^ (h0 >> 24)) & (cacheSize - 1)
+  __ sar(rdx, Immediate(32));
+  __ xorl(rdx, rbx);
+  __ movl(rcx, rdx);
+  __ movl(rax, rdx);
+  __ movl(rdi, rdx);
+  __ sarl(rdx, Immediate(8));
+  __ sarl(rcx, Immediate(16));
+  __ sarl(rax, Immediate(24));
+  __ xorl(rcx, rdx);
+  __ xorl(rax, rdi);
+  __ xorl(rcx, rax);
+  ASSERT(IsPowerOf2(TranscendentalCache::kCacheSize));
+  __ andl(rcx, Immediate(TranscendentalCache::kCacheSize - 1));
+  // ST[0] == double value.
+  // rbx = bits of double value.
+  // rcx = TranscendentalCache::hash(double value).
+  __ movq(rax, ExternalReference::transcendental_cache_array_address());
+  // rax points to cache array.
+  __ movq(rax, Operand(rax, type_ * sizeof(TranscendentalCache::caches_[0])));
+  // rax points to the cache for the type type_.
+  // If NULL, the cache hasn't been initialized yet, so go through runtime.
+  __ testq(rax, rax);
+  __ j(zero, &runtime_call_clear_stack);
+#ifdef DEBUG
+  // Check that the layout of cache elements match expectations.
+  {  // NOLINT - doesn't like a single brace on a line.
+    TranscendentalCache::Element test_elem[2];
+    char* elem_start = reinterpret_cast<char*>(&test_elem[0]);
+    char* elem2_start = reinterpret_cast<char*>(&test_elem[1]);
+    char* elem_in0  = reinterpret_cast<char*>(&(test_elem[0].in[0]));
+    char* elem_in1  = reinterpret_cast<char*>(&(test_elem[0].in[1]));
+    char* elem_out = reinterpret_cast<char*>(&(test_elem[0].output));
+    // Two uint_32's and a pointer per element.
+    CHECK_EQ(16, static_cast<int>(elem2_start - elem_start));
+    CHECK_EQ(0, static_cast<int>(elem_in0 - elem_start));
+    CHECK_EQ(kIntSize, static_cast<int>(elem_in1 - elem_start));
+    CHECK_EQ(2 * kIntSize, static_cast<int>(elem_out - elem_start));
+  }
+#endif
+  // Find the address of the rcx'th entry in the cache, i.e., &rax[rcx*16].
+  __ addl(rcx, rcx);
+  __ lea(rcx, Operand(rax, rcx, times_8, 0));
+  // Check if cache matches: Double value is stored in uint32_t[2] array.
+  Label cache_miss;
+  __ cmpq(rbx, Operand(rcx, 0));
+  __ j(not_equal, &cache_miss);
+  // Cache hit!
+  __ movq(rax, Operand(rcx, 2 * kIntSize));
+  __ fstp(0);  // Clear FPU stack.
+  __ ret(kPointerSize);
+
+  __ bind(&cache_miss);
+  // Update cache with new value.
+  __ AllocateHeapNumber(rax, rdi, &runtime_call_clear_stack);
+  GenerateOperation(masm);
+  __ movq(Operand(rcx, 0), rbx);
+  __ movq(Operand(rcx, 2 * kIntSize), rax);
+  __ fstp_d(FieldOperand(rax, HeapNumber::kValueOffset));
+  __ ret(kPointerSize);
+
+  __ bind(&runtime_call_clear_stack);
+  __ fstp(0);
+  __ bind(&runtime_call);
+  __ TailCallExternalReference(ExternalReference(RuntimeFunction()), 1, 1);
+}
+
+
+Runtime::FunctionId TranscendentalCacheStub::RuntimeFunction() {
+  switch (type_) {
+    // Add more cases when necessary.
+    case TranscendentalCache::SIN: return Runtime::kMath_sin;
+    case TranscendentalCache::COS: return Runtime::kMath_cos;
+    default:
+      UNIMPLEMENTED();
+      return Runtime::kAbort;
+  }
+}
+
+
+void TranscendentalCacheStub::GenerateOperation(MacroAssembler* masm) {
+  // Only free register is rdi.
+  Label done;
+  ASSERT(type_ == TranscendentalCache::SIN ||
+         type_ == TranscendentalCache::COS);
+  // More transcendental types can be added later.
+
+  // Both fsin and fcos require arguments in the range +/-2^63 and
+  // return NaN for infinities and NaN. They can share all code except
+  // the actual fsin/fcos operation.
+  Label in_range;
+  // If argument is outside the range -2^63..2^63, fsin/cos doesn't
+  // work. We must reduce it to the appropriate range.
+  __ movq(rdi, rbx);
+  // Move exponent and sign bits to low bits.
+  __ shr(rdi, Immediate(HeapNumber::kMantissaBits));
+  // Remove sign bit.
+  __ andl(rdi, Immediate((1 << HeapNumber::KExponentBits) - 1));
+  int supported_exponent_limit = (63 + HeapNumber::kExponentBias);
+  __ cmpl(rdi, Immediate(supported_exponent_limit));
+  __ j(below, &in_range);
+  // Check for infinity and NaN. Both return NaN for sin.
+  __ cmpl(rdi, Immediate(0x7ff));
+  Label non_nan_result;
+  __ j(not_equal, &non_nan_result);
+  // Input is +/-Infinity or NaN. Result is NaN.
+  __ fstp(0);  // Clear fpu stack.
+  // NaN is represented by 0x7ff8000000000000.
+  __ movq(rdi, kNaNValue, RelocInfo::NONE);
+  __ push(rdi);
+  __ fld_d(Operand(rsp, 0));
+  __ addq(rsp, Immediate(kPointerSize));
+  __ jmp(&done);
+
+  __ bind(&non_nan_result);
+
+  // Use fpmod to restrict argument to the range +/-2*PI.
+  __ movq(rdi, rax);  // Save rax before using fnstsw_ax.
+  __ fldpi();
+  __ fadd(0);
+  __ fld(1);
+  // FPU Stack: input, 2*pi, input.
+  {
+    Label no_exceptions;
+    __ fwait();
+    __ fnstsw_ax();
+    // Clear if Illegal Operand or Zero Division exceptions are set.
+    __ testl(rax, Immediate(5));
+    __ j(zero, &no_exceptions);
+    __ fnclex();
+    __ bind(&no_exceptions);
+  }
+
+  // Compute st(0) % st(1)
+  {
+    Label partial_remainder_loop;
+    __ bind(&partial_remainder_loop);
+    __ fprem1();
+    __ fwait();
+    __ fnstsw_ax();
+    __ testl(rax, Immediate(0x400));  // Check C2 bit of FPU status word.
+    // If C2 is set, computation only has partial result. Loop to
+    // continue computation.
+    __ j(not_zero, &partial_remainder_loop);
+  }
+  // FPU Stack: input, 2*pi, input % 2*pi
+  __ fstp(2);
+  // FPU Stack: input % 2*pi, 2*pi,
+  __ fstp(0);
+  // FPU Stack: input % 2*pi
+  __ movq(rax, rdi);  // Restore rax (allocated HeapNumber pointer).
+
+  // FPU Stack: input % 2*pi
+  __ bind(&in_range);
+  switch (type_) {
+    case TranscendentalCache::SIN:
+      __ fsin();
+      break;
+    case TranscendentalCache::COS:
+      __ fcos();
+      break;
+    default:
+      UNREACHABLE();
+  }
+  __ bind(&done);
+}
+
+
 // Get the integer part of a heap number.  Surprisingly, all this bit twiddling
 // is faster than using the built-in instructions on floating point registers.
 // Trashes rdi and rbx.  Dest is rcx.  Source cannot be rcx or one of the
@ -10995,7 +11207,6 @@ ModuloFunction CreateModuloFunction() {
  __ testb(rax, Immediate(5));
  __ j(zero, &valid_result);
  __ fstp(0);  // Drop result in st(0).
-  int64_t kNaNValue = V8_INT64_C(0x7ff8000000000000);
  __ movq(rcx, kNaNValue, RelocInfo::NONE);
  __ movq(Operand(rsp, kPointerSize), rcx);
  __ movsd(xmm0, Operand(rsp, kPointerSize));
--- a/src/x64/codegen-x64.h
+++ b/src/x64/codegen-x64.h
@ -675,6 +675,22 @@ class CodeGenerator: public AstVisitor {
 };


+// Compute a transcendental math function natively, or call the
+// TranscendentalCache runtime function.
+class TranscendentalCacheStub: public CodeStub {
+ public:
+  explicit TranscendentalCacheStub(TranscendentalCache::Type type)
+      : type_(type) {}
+  void Generate(MacroAssembler* masm);
+ private:
+  TranscendentalCache::Type type_;
+  Major MajorKey() { return TranscendentalCache; }
+  int MinorKey() { return type_; }
+  Runtime::FunctionId RuntimeFunction();
+  void GenerateOperation(MacroAssembler* masm);
+};
+
+
 // Flag that indicates how to generate code for the stub GenericBinaryOpStub.
 enum GenericBinaryFlags {
  NO_GENERIC_BINARY_FLAGS = 0,
--- a/src/x64/disasm-x64.cc
+++ b/src/x64/disasm-x64.cc
@ -996,23 +996,44 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
  if (operand_size_ == 0x66) {
    // 0x66 0x0F prefix.
    int mod, regop, rm;
-    get_modrm(*current, &mod, &regop, &rm);
-    if (opcode == 0x6E) {
-      AppendToBuffer("movd %s,", NameOfXMMRegister(regop));
-      current += PrintRightOperand(current);
-    } else {
-      const char* mnemonic = "?";
-      if (opcode == 0x57) {
-        mnemonic = "xorpd";
-      } else if (opcode == 0x2E) {
-        mnemonic = "comisd";
-      } else if (opcode == 0x2F) {
-        mnemonic = "ucomisd";
+    if (opcode == 0x3A) {
+      byte third_byte = *current;
+      current = data + 3;
+      if (third_byte == 0x17) {
+        get_modrm(*current, &mod, &regop, &rm);
+        AppendToBuffer("extractps ");  // reg/m32, xmm, imm8
+        current += PrintRightOperand(current);
+        AppendToBuffer(", %s, %d", NameOfCPURegister(regop), (*current) & 3);
+        current += 1;
      } else {
        UnimplementedInstruction();
      }
-      AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop));
-      current += PrintRightXMMOperand(current);
+    } else {
+      get_modrm(*current, &mod, &regop, &rm);
+      if (opcode == 0x6E) {
+        AppendToBuffer("mov%c %s,",
+                       rex_w() ? 'q' : 'd',
+                       NameOfXMMRegister(regop));
+        current += PrintRightOperand(current);
+      } else if (opcode == 0x7E) {
+        AppendToBuffer("mov%c %s,",
+                       rex_w() ? 'q' : 'd',
+                       NameOfCPURegister(regop));
+        current += PrintRightXMMOperand(current);
+      } else {
+        const char* mnemonic = "?";
+        if (opcode == 0x57) {
+          mnemonic = "xorpd";
+        } else if (opcode == 0x2E) {
+          mnemonic = "comisd";
+        } else if (opcode == 0x2F) {
+          mnemonic = "ucomisd";
+        } else {
+          UnimplementedInstruction();
+        }
+        AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop));
+        current += PrintRightXMMOperand(current);
+      }
    }
  } else if (group_1_prefix_ == 0xF2) {
    // Beginning of instructions with prefix 0xF2.