Reland of Reland of "ARM64: Add NEON support"

This reverts commit c5aad5f284.
The CL was reverted due to missing Chromium dependencies.

This commit removes the simulator trace-based tests, and the associated header file dependencies previously pulled in by DEPS. NEON support now has only hand-written tests, in test-assembler-arm64.cc; the remaining tests can be added in a later patch.

BUG=chromium:718439

Original issue's description:
> Reland "ARM64: Add NEON support"
>
> This reverts commit cc047635ff.
> The CL was reverted due to a missing DEPS mirror.
>
> Original issue's description:
> > ARM64: Add NEON support
> >
> > Add assembler, disassembler and simulator support for NEON in the ARM64 backend.
> >
> > BUG=
> >
> > Review-Url: https://codereview.chromium.org/2622643005
> > Cr-Commit-Position: refs/heads/master@{#44306}
>
> BUG=
>
> Review-Url: https://codereview.chromium.org/2812573003
> Cr-Commit-Position: refs/heads/master@{#44652}

Review-Url: https://codereview.chromium.org/2896303003
Cr-Commit-Position: refs/heads/master@{#45633}
martyn.capewell 2017-05-31 06:58:43 -07:00 committed by Commit Bot
parent d8a42e4c09
commit fc3f29d329
34 changed files with 25275 additions and 2621 deletions


@@ -2205,6 +2205,7 @@ v8_source_set("v8_base") {
"src/arm64/macro-assembler-arm64.h",
"src/arm64/simulator-arm64.cc",
"src/arm64/simulator-arm64.h",
"src/arm64/simulator-logic-arm64.cc",
"src/arm64/utils-arm64.cc",
"src/arm64/utils-arm64.h",
"src/compiler/arm64/code-generator-arm64.cc",


@@ -57,6 +57,15 @@ inline int CPURegister::SizeInBytes() const {
return reg_size / 8;
}
inline bool CPURegister::Is8Bits() const {
DCHECK(IsValid());
return reg_size == 8;
}
inline bool CPURegister::Is16Bits() const {
DCHECK(IsValid());
return reg_size == 16;
}
inline bool CPURegister::Is32Bits() const {
DCHECK(IsValid());
@@ -69,9 +78,13 @@ inline bool CPURegister::Is64Bits() const {
return reg_size == 64;
}
inline bool CPURegister::Is128Bits() const {
DCHECK(IsValid());
return reg_size == 128;
}
inline bool CPURegister::IsValid() const {
if (IsValidRegister() || IsValidFPRegister()) {
if (IsValidRegister() || IsValidVRegister()) {
DCHECK(!IsNone());
return true;
} else {
@@ -87,14 +100,14 @@ inline bool CPURegister::IsValidRegister() const {
((reg_code < kNumberOfRegisters) || (reg_code == kSPRegInternalCode));
}
inline bool CPURegister::IsValidFPRegister() const {
return IsFPRegister() &&
((reg_size == kSRegSizeInBits) || (reg_size == kDRegSizeInBits)) &&
(reg_code < kNumberOfFPRegisters);
inline bool CPURegister::IsValidVRegister() const {
return IsVRegister() &&
((reg_size == kBRegSizeInBits) || (reg_size == kHRegSizeInBits) ||
(reg_size == kSRegSizeInBits) || (reg_size == kDRegSizeInBits) ||
(reg_size == kQRegSizeInBits)) &&
(reg_code < kNumberOfVRegisters);
}
inline bool CPURegister::IsNone() const {
// kNoRegister types should always have size 0 and code 0.
DCHECK((reg_type != kNoRegister) || (reg_code == 0));
@@ -120,11 +133,7 @@ inline bool CPURegister::IsRegister() const {
return reg_type == kRegister;
}
inline bool CPURegister::IsFPRegister() const {
return reg_type == kFPRegister;
}
inline bool CPURegister::IsVRegister() const { return reg_type == kVRegister; }
inline bool CPURegister::IsSameSizeAndType(const CPURegister& other) const {
return (reg_size == other.reg_size) && (reg_type == other.reg_type);
@@ -200,7 +209,7 @@ inline Register Register::XRegFromCode(unsigned code) {
if (code == kSPRegInternalCode) {
return csp;
} else {
DCHECK(code < kNumberOfRegisters);
DCHECK_LT(code, static_cast<unsigned>(kNumberOfRegisters));
return Register::Create(code, kXRegSizeInBits);
}
}
@@ -210,23 +219,40 @@ inline Register Register::WRegFromCode(unsigned code) {
if (code == kSPRegInternalCode) {
return wcsp;
} else {
DCHECK(code < kNumberOfRegisters);
DCHECK_LT(code, static_cast<unsigned>(kNumberOfRegisters));
return Register::Create(code, kWRegSizeInBits);
}
}
inline FPRegister FPRegister::SRegFromCode(unsigned code) {
DCHECK(code < kNumberOfFPRegisters);
return FPRegister::Create(code, kSRegSizeInBits);
inline VRegister VRegister::BRegFromCode(unsigned code) {
DCHECK_LT(code, static_cast<unsigned>(kNumberOfVRegisters));
return VRegister::Create(code, kBRegSizeInBits);
}
inline FPRegister FPRegister::DRegFromCode(unsigned code) {
DCHECK(code < kNumberOfFPRegisters);
return FPRegister::Create(code, kDRegSizeInBits);
inline VRegister VRegister::HRegFromCode(unsigned code) {
DCHECK_LT(code, static_cast<unsigned>(kNumberOfVRegisters));
return VRegister::Create(code, kHRegSizeInBits);
}
inline VRegister VRegister::SRegFromCode(unsigned code) {
DCHECK_LT(code, static_cast<unsigned>(kNumberOfVRegisters));
return VRegister::Create(code, kSRegSizeInBits);
}
inline VRegister VRegister::DRegFromCode(unsigned code) {
DCHECK_LT(code, static_cast<unsigned>(kNumberOfVRegisters));
return VRegister::Create(code, kDRegSizeInBits);
}
inline VRegister VRegister::QRegFromCode(unsigned code) {
DCHECK_LT(code, static_cast<unsigned>(kNumberOfVRegisters));
return VRegister::Create(code, kQRegSizeInBits);
}
inline VRegister VRegister::VRegFromCode(unsigned code) {
DCHECK_LT(code, static_cast<unsigned>(kNumberOfVRegisters));
return VRegister::Create(code, kVRegSizeInBits);
}
inline Register CPURegister::W() const {
DCHECK(IsValidRegister());
@@ -239,16 +265,34 @@ inline Register CPURegister::X() const {
return Register::XRegFromCode(reg_code);
}
inline FPRegister CPURegister::S() const {
DCHECK(IsValidFPRegister());
return FPRegister::SRegFromCode(reg_code);
inline VRegister CPURegister::V() const {
DCHECK(IsValidVRegister());
return VRegister::VRegFromCode(reg_code);
}
inline VRegister CPURegister::B() const {
DCHECK(IsValidVRegister());
return VRegister::BRegFromCode(reg_code);
}
inline FPRegister CPURegister::D() const {
DCHECK(IsValidFPRegister());
return FPRegister::DRegFromCode(reg_code);
inline VRegister CPURegister::H() const {
DCHECK(IsValidVRegister());
return VRegister::HRegFromCode(reg_code);
}
inline VRegister CPURegister::S() const {
DCHECK(IsValidVRegister());
return VRegister::SRegFromCode(reg_code);
}
inline VRegister CPURegister::D() const {
DCHECK(IsValidVRegister());
return VRegister::DRegFromCode(reg_code);
}
inline VRegister CPURegister::Q() const {
DCHECK(IsValidVRegister());
return VRegister::QRegFromCode(reg_code);
}
@@ -491,7 +535,7 @@ MemOperand::MemOperand(Register base, const Operand& offset, AddrMode addrmode)
regoffset_ = NoReg;
} else if (offset.IsShiftedRegister()) {
DCHECK(addrmode == Offset);
DCHECK((addrmode == Offset) || (addrmode == PostIndex));
regoffset_ = offset.reg();
shift_ = offset.shift();
@@ -877,21 +921,20 @@ LoadStoreOp Assembler::LoadOpFor(const CPURegister& rt) {
if (rt.IsRegister()) {
return rt.Is64Bits() ? LDR_x : LDR_w;
} else {
DCHECK(rt.IsFPRegister());
return rt.Is64Bits() ? LDR_d : LDR_s;
}
}
LoadStorePairOp Assembler::LoadPairOpFor(const CPURegister& rt,
const CPURegister& rt2) {
DCHECK(AreSameSizeAndType(rt, rt2));
USE(rt2);
if (rt.IsRegister()) {
return rt.Is64Bits() ? LDP_x : LDP_w;
} else {
DCHECK(rt.IsFPRegister());
return rt.Is64Bits() ? LDP_d : LDP_s;
DCHECK(rt.IsVRegister());
switch (rt.SizeInBits()) {
case kBRegSizeInBits:
return LDR_b;
case kHRegSizeInBits:
return LDR_h;
case kSRegSizeInBits:
return LDR_s;
case kDRegSizeInBits:
return LDR_d;
default:
DCHECK(rt.IsQ());
return LDR_q;
}
}
}
@@ -901,11 +944,29 @@ LoadStoreOp Assembler::StoreOpFor(const CPURegister& rt) {
if (rt.IsRegister()) {
return rt.Is64Bits() ? STR_x : STR_w;
} else {
DCHECK(rt.IsFPRegister());
return rt.Is64Bits() ? STR_d : STR_s;
DCHECK(rt.IsVRegister());
switch (rt.SizeInBits()) {
case kBRegSizeInBits:
return STR_b;
case kHRegSizeInBits:
return STR_h;
case kSRegSizeInBits:
return STR_s;
case kDRegSizeInBits:
return STR_d;
default:
DCHECK(rt.IsQ());
return STR_q;
}
}
}
LoadStorePairOp Assembler::LoadPairOpFor(const CPURegister& rt,
const CPURegister& rt2) {
DCHECK_EQ(STP_w | LoadStorePairLBit, LDP_w);
return static_cast<LoadStorePairOp>(StorePairOpFor(rt, rt2) |
LoadStorePairLBit);
}
LoadStorePairOp Assembler::StorePairOpFor(const CPURegister& rt,
const CPURegister& rt2) {
@@ -914,8 +975,16 @@ LoadStorePairOp Assembler::StorePairOpFor(const CPURegister& rt,
if (rt.IsRegister()) {
return rt.Is64Bits() ? STP_x : STP_w;
} else {
DCHECK(rt.IsFPRegister());
return rt.Is64Bits() ? STP_d : STP_s;
DCHECK(rt.IsVRegister());
switch (rt.SizeInBits()) {
case kSRegSizeInBits:
return STP_s;
case kDRegSizeInBits:
return STP_d;
default:
DCHECK(rt.IsQ());
return STP_q;
}
}
}
@@ -924,7 +993,7 @@ LoadLiteralOp Assembler::LoadLiteralOpFor(const CPURegister& rt) {
if (rt.IsRegister()) {
return rt.Is64Bits() ? LDR_x_lit : LDR_w_lit;
} else {
DCHECK(rt.IsFPRegister());
DCHECK(rt.IsVRegister());
return rt.Is64Bits() ? LDR_d_lit : LDR_s_lit;
}
}
@@ -1107,9 +1176,8 @@ Instr Assembler::ImmLS(int imm9) {
return truncate_to_int9(imm9) << ImmLS_offset;
}
Instr Assembler::ImmLSPair(int imm7, LSDataSize size) {
DCHECK(((imm7 >> size) << size) == imm7);
Instr Assembler::ImmLSPair(int imm7, unsigned size) {
DCHECK_EQ((imm7 >> size) << size, imm7);
int scaled_imm7 = imm7 >> size;
DCHECK(is_int7(scaled_imm7));
return truncate_to_int7(scaled_imm7) << ImmLSPair_offset;
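
// Illustration (not part of the patch): the pair-offset immediate is stored
// pre-scaled by the access size, which is what the DCHECK above enforces.
// Standalone arithmetic, with assumed values:
#include <cassert>
int main() {
  int imm7 = 32, size = 3;                 // 32-byte offset, 8-byte accesses
  int scaled_imm7 = imm7 >> size;          // encoded immediate: 4
  assert((scaled_imm7 << size) == imm7);   // the alignment test above
  assert(scaled_imm7 == 4);
  return 0;
}
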
@@ -1151,10 +1219,17 @@ Instr Assembler::ImmBarrierType(int imm2) {
return imm2 << ImmBarrierType_offset;
}
LSDataSize Assembler::CalcLSDataSize(LoadStoreOp op) {
DCHECK((SizeLS_offset + SizeLS_width) == (kInstructionSize * 8));
return static_cast<LSDataSize>(op >> SizeLS_offset);
unsigned Assembler::CalcLSDataSize(LoadStoreOp op) {
DCHECK((LSSize_offset + LSSize_width) == (kInstructionSize * 8));
unsigned size = static_cast<Instr>(op >> LSSize_offset);
if ((op & LSVector_mask) != 0) {
// Vector register memory operations encode the access size in the "size"
// and "opc" fields.
if ((size == 0) && ((op & LSOpc_mask) >> LSOpc_offset) >= 2) {
size = kQRegSizeLog2;
}
}
return size;
}
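
// Illustration (not part of the patch): for most loads/stores the access
// size is just the "size" field, but Q-register accesses reuse size == 0
// together with opc >= 2. Standalone sketch over already-extracted,
// hypothetical field values:
#include <cassert>
unsigned ExampleAccessSizeLog2(unsigned size, unsigned opc, bool is_vector) {
  if (is_vector && (size == 0) && (opc >= 2)) return 4;  // log2 of 16 bytes
  return size;
}
int main() {
  assert(ExampleAccessSizeLog2(3, 1, true) == 3);  // LDR_d: 8-byte access
  assert(ExampleAccessSizeLog2(0, 3, true) == 4);  // LDR_q: 16-byte access
  return 0;
}
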
@@ -1169,11 +1244,7 @@ Instr Assembler::ShiftMoveWide(int shift) {
return shift << ShiftMoveWide_offset;
}
Instr Assembler::FPType(FPRegister fd) {
return fd.Is64Bits() ? FP64 : FP32;
}
Instr Assembler::FPType(VRegister fd) { return fd.Is64Bits() ? FP64 : FP32; }
Instr Assembler::FPScale(unsigned scale) {
DCHECK(is_uint6(scale));

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -147,8 +147,8 @@ void DoubleToIStub::Generate(MacroAssembler* masm) {
// See call site for description.
static void EmitIdenticalObjectComparison(MacroAssembler* masm, Register left,
Register right, Register scratch,
FPRegister double_scratch,
Label* slow, Condition cond) {
VRegister double_scratch, Label* slow,
Condition cond) {
DCHECK(!AreAliased(left, right, scratch));
Label not_identical, return_equal, heap_number;
Register result = x0;
@@ -292,12 +289,9 @@ static void EmitStrictTwoHeapObjectCompare(MacroAssembler* masm,
// See call site for description.
static void EmitSmiNonsmiComparison(MacroAssembler* masm,
Register left,
Register right,
FPRegister left_d,
FPRegister right_d,
Label* slow,
static void EmitSmiNonsmiComparison(MacroAssembler* masm, Register left,
Register right, VRegister left_d,
VRegister right_d, Label* slow,
bool strict) {
DCHECK(!AreAliased(left_d, right_d));
DCHECK((left.is(x0) && right.is(x1)) ||
@@ -476,8 +473,8 @@ void CompareICStub::GenerateGeneric(MacroAssembler* masm) {
// In case 3, we have found out that we were dealing with a number-number
// comparison. The double values of the numbers have been loaded, right into
// rhs_d, left into lhs_d.
FPRegister rhs_d = d0;
FPRegister lhs_d = d1;
VRegister rhs_d = d0;
VRegister lhs_d = d1;
EmitSmiNonsmiComparison(masm, lhs, rhs, lhs_d, rhs_d, &slow, strict());
__ Bind(&both_loaded_as_doubles);
@@ -613,7 +610,7 @@ void CompareICStub::GenerateGeneric(MacroAssembler* masm) {
void StoreBufferOverflowStub::Generate(MacroAssembler* masm) {
CPURegList saved_regs = kCallerSaved;
CPURegList saved_fp_regs = kCallerSavedFP;
CPURegList saved_fp_regs = kCallerSavedV;
// We don't allow a GC during a store buffer overflow so there is no need to
// store the registers in any particular way, but we do have to store and
@@ -686,12 +683,12 @@ void MathPowStub::Generate(MacroAssembler* masm) {
Register exponent_integer = MathPowIntegerDescriptor::exponent();
DCHECK(exponent_integer.is(x12));
Register saved_lr = x19;
FPRegister result_double = d0;
FPRegister base_double = d0;
FPRegister exponent_double = d1;
FPRegister base_double_copy = d2;
FPRegister scratch1_double = d6;
FPRegister scratch0_double = d7;
VRegister result_double = d0;
VRegister base_double = d0;
VRegister exponent_double = d1;
VRegister base_double_copy = d2;
VRegister scratch1_double = d6;
VRegister scratch0_double = d7;
// A fast-path for integer exponents.
Label exponent_is_smi, exponent_is_integer;
@@ -1582,8 +1579,8 @@ void CompareICStub::GenerateNumbers(MacroAssembler* masm) {
Register result = x0;
Register rhs = x0;
Register lhs = x1;
FPRegister rhs_d = d0;
FPRegister lhs_d = d1;
VRegister rhs_d = d0;
VRegister lhs_d = d1;
if (left() == CompareICState::SMI) {
__ JumpIfNotSmi(lhs, &miss);
@@ -2042,7 +2039,7 @@ RecordWriteStub::RegisterAllocation::RegisterAllocation(Register object,
address_(address),
scratch0_(scratch),
saved_regs_(kCallerSaved),
saved_fp_regs_(kCallerSavedFP) {
saved_fp_regs_(kCallerSavedV) {
DCHECK(!AreAliased(scratch, object, address));
// The SaveCallerSaveRegisters method needs to save caller-saved

File diff suppressed because it is too large.


@@ -213,6 +213,11 @@ void Decoder<V>::DecodeLoadStore(Instruction* instr) {
(instr->Bits(27, 24) == 0xC) ||
(instr->Bits(27, 24) == 0xD) );
if ((instr->Bit(28) == 0) && (instr->Bit(29) == 0) && (instr->Bit(26) == 1)) {
DecodeNEONLoadStore(instr);
return;
}
if (instr->Bit(24) == 0) {
if (instr->Bit(28) == 0) {
if (instr->Bit(29) == 0) {
@@ -226,8 +231,6 @@ void Decoder<V>::DecodeLoadStore(Instruction* instr) {
} else {
V::VisitLoadStoreAcquireRelease(instr);
}
} else {
DecodeAdvSIMDLoadStore(instr);
}
} else {
if ((instr->Bits(31, 30) == 0x3) ||
@@ -513,16 +516,14 @@ void Decoder<V>::DecodeFP(Instruction* instr) {
(instr->Bits(27, 24) == 0xF) );
if (instr->Bit(28) == 0) {
DecodeAdvSIMDDataProcessing(instr);
DecodeNEONVectorDataProcessing(instr);
} else {
if (instr->Bit(29) == 1) {
if (instr->Bits(31, 30) == 0x3) {
V::VisitUnallocated(instr);
} else if (instr->Bits(31, 30) == 0x1) {
DecodeNEONScalarDataProcessing(instr);
} else {
if (instr->Bits(31, 30) == 0x3) {
V::VisitUnallocated(instr);
} else if (instr->Bits(31, 30) == 0x1) {
DecodeAdvSIMDDataProcessing(instr);
} else {
if (instr->Bit(29) == 0) {
if (instr->Bit(24) == 0) {
if (instr->Bit(21) == 0) {
if ((instr->Bit(23) == 1) ||
@@ -629,25 +630,190 @@ void Decoder<V>::DecodeFP(Instruction* instr) {
V::VisitFPDataProcessing3Source(instr);
}
}
} else {
V::VisitUnallocated(instr);
}
}
}
}
template<typename V>
void Decoder<V>::DecodeAdvSIMDLoadStore(Instruction* instr) {
// TODO(all): Implement Advanced SIMD load/store instruction decode.
template <typename V>
void Decoder<V>::DecodeNEONLoadStore(Instruction* instr) {
DCHECK(instr->Bits(29, 25) == 0x6);
V::VisitUnimplemented(instr);
if (instr->Bit(31) == 0) {
if ((instr->Bit(24) == 0) && (instr->Bit(21) == 1)) {
V::VisitUnallocated(instr);
return;
}
if (instr->Bit(23) == 0) {
if (instr->Bits(20, 16) == 0) {
if (instr->Bit(24) == 0) {
V::VisitNEONLoadStoreMultiStruct(instr);
} else {
V::VisitNEONLoadStoreSingleStruct(instr);
}
} else {
V::VisitUnallocated(instr);
}
} else {
if (instr->Bit(24) == 0) {
V::VisitNEONLoadStoreMultiStructPostIndex(instr);
} else {
V::VisitNEONLoadStoreSingleStructPostIndex(instr);
}
}
} else {
V::VisitUnallocated(instr);
}
}
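
// Illustration (not part of the patch): the branch structure above, reduced
// to a routing function over already-extracted bits. The example bit values
// are assumptions for illustration, e.g. a no-offset LD1 (multiple
// structures) with bit 31 = 0, bit 24 = 0, bit 23 = 0 and bits 20:16 = 0.
#include <cassert>
enum ExampleVisitor { kMulti, kSingle, kMultiPost, kSinglePost, kUnalloc };
ExampleVisitor ExampleRouteNEONLoadStore(int b31, int b24, int b23, int b21,
                                         int bits20_16) {
  if (b31 != 0) return kUnalloc;
  if ((b24 == 0) && (b21 == 1)) return kUnalloc;
  if (b23 == 0) {
    if (bits20_16 != 0) return kUnalloc;
    return (b24 == 0) ? kMulti : kSingle;
  }
  return (b24 == 0) ? kMultiPost : kSinglePost;  // post-index forms
}
int main() {
  assert(ExampleRouteNEONLoadStore(0, 0, 0, 0, 0) == kMulti);
  assert(ExampleRouteNEONLoadStore(0, 1, 1, 0, 3) == kSinglePost);  // Rm set
  return 0;
}
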
template <typename V>
void Decoder<V>::DecodeNEONVectorDataProcessing(Instruction* instr) {
DCHECK(instr->Bits(28, 25) == 0x7);
if (instr->Bit(31) == 0) {
if (instr->Bit(24) == 0) {
if (instr->Bit(21) == 0) {
if (instr->Bit(15) == 0) {
if (instr->Bit(10) == 0) {
if (instr->Bit(29) == 0) {
if (instr->Bit(11) == 0) {
V::VisitNEONTable(instr);
} else {
V::VisitNEONPerm(instr);
}
} else {
V::VisitNEONExtract(instr);
}
} else {
if (instr->Bits(23, 22) == 0) {
V::VisitNEONCopy(instr);
} else {
V::VisitUnallocated(instr);
}
}
} else {
V::VisitUnallocated(instr);
}
} else {
if (instr->Bit(10) == 0) {
if (instr->Bit(11) == 0) {
V::VisitNEON3Different(instr);
} else {
if (instr->Bits(18, 17) == 0) {
if (instr->Bit(20) == 0) {
if (instr->Bit(19) == 0) {
V::VisitNEON2RegMisc(instr);
} else {
if (instr->Bits(30, 29) == 0x2) {
V::VisitUnallocated(instr);
} else {
V::VisitUnallocated(instr);
}
}
} else {
if (instr->Bit(19) == 0) {
V::VisitNEONAcrossLanes(instr);
} else {
V::VisitUnallocated(instr);
}
}
} else {
V::VisitUnallocated(instr);
}
}
} else {
V::VisitNEON3Same(instr);
}
}
} else {
if (instr->Bit(10) == 0) {
V::VisitNEONByIndexedElement(instr);
} else {
if (instr->Bit(23) == 0) {
if (instr->Bits(22, 19) == 0) {
V::VisitNEONModifiedImmediate(instr);
} else {
V::VisitNEONShiftImmediate(instr);
}
} else {
V::VisitUnallocated(instr);
}
}
}
} else {
V::VisitUnallocated(instr);
}
}
template<typename V>
void Decoder<V>::DecodeAdvSIMDDataProcessing(Instruction* instr) {
// TODO(all): Implement Advanced SIMD data processing instruction decode.
DCHECK(instr->Bits(27, 25) == 0x7);
V::VisitUnimplemented(instr);
template <typename V>
void Decoder<V>::DecodeNEONScalarDataProcessing(Instruction* instr) {
DCHECK(instr->Bits(28, 25) == 0xF);
if (instr->Bit(24) == 0) {
if (instr->Bit(21) == 0) {
if (instr->Bit(15) == 0) {
if (instr->Bit(10) == 0) {
if (instr->Bit(29) == 0) {
if (instr->Bit(11) == 0) {
V::VisitUnallocated(instr);
} else {
V::VisitUnallocated(instr);
}
} else {
V::VisitUnallocated(instr);
}
} else {
if (instr->Bits(23, 22) == 0) {
V::VisitNEONScalarCopy(instr);
} else {
V::VisitUnallocated(instr);
}
}
} else {
V::VisitUnallocated(instr);
}
} else {
if (instr->Bit(10) == 0) {
if (instr->Bit(11) == 0) {
V::VisitNEONScalar3Diff(instr);
} else {
if (instr->Bits(18, 17) == 0) {
if (instr->Bit(20) == 0) {
if (instr->Bit(19) == 0) {
V::VisitNEONScalar2RegMisc(instr);
} else {
if (instr->Bit(29) == 0) {
V::VisitUnallocated(instr);
} else {
V::VisitUnallocated(instr);
}
}
} else {
if (instr->Bit(19) == 0) {
V::VisitNEONScalarPairwise(instr);
} else {
V::VisitUnallocated(instr);
}
}
} else {
V::VisitUnallocated(instr);
}
}
} else {
V::VisitNEONScalar3Same(instr);
}
}
} else {
if (instr->Bit(10) == 0) {
V::VisitNEONScalarByIndexedElement(instr);
} else {
if (instr->Bit(23) == 0) {
V::VisitNEONScalarShiftImmediate(instr);
} else {
V::VisitUnallocated(instr);
}
}
}
}


@@ -16,50 +16,72 @@ namespace internal {
// List macro containing all visitors needed by the decoder class.
#define VISITOR_LIST(V) \
V(PCRelAddressing) \
V(AddSubImmediate) \
V(LogicalImmediate) \
V(MoveWideImmediate) \
V(Bitfield) \
V(Extract) \
V(UnconditionalBranch) \
V(UnconditionalBranchToRegister) \
V(CompareBranch) \
V(TestBranch) \
V(ConditionalBranch) \
V(System) \
V(Exception) \
V(LoadStorePairPostIndex) \
V(LoadStorePairOffset) \
V(LoadStorePairPreIndex) \
V(LoadLiteral) \
V(LoadStoreUnscaledOffset) \
V(LoadStorePostIndex) \
V(LoadStorePreIndex) \
V(LoadStoreRegisterOffset) \
V(LoadStoreUnsignedOffset) \
V(LoadStoreAcquireRelease) \
V(LogicalShifted) \
V(AddSubShifted) \
V(AddSubExtended) \
V(AddSubWithCarry) \
V(ConditionalCompareRegister) \
V(ConditionalCompareImmediate) \
V(ConditionalSelect) \
V(DataProcessing1Source) \
V(DataProcessing2Source) \
V(DataProcessing3Source) \
V(FPCompare) \
V(FPConditionalCompare) \
V(FPConditionalSelect) \
V(FPImmediate) \
V(FPDataProcessing1Source) \
V(FPDataProcessing2Source) \
V(FPDataProcessing3Source) \
V(FPIntegerConvert) \
V(FPFixedPointConvert) \
V(Unallocated) \
#define VISITOR_LIST(V) \
V(PCRelAddressing) \
V(AddSubImmediate) \
V(LogicalImmediate) \
V(MoveWideImmediate) \
V(Bitfield) \
V(Extract) \
V(UnconditionalBranch) \
V(UnconditionalBranchToRegister) \
V(CompareBranch) \
V(TestBranch) \
V(ConditionalBranch) \
V(System) \
V(Exception) \
V(LoadStorePairPostIndex) \
V(LoadStorePairOffset) \
V(LoadStorePairPreIndex) \
V(LoadLiteral) \
V(LoadStoreUnscaledOffset) \
V(LoadStorePostIndex) \
V(LoadStorePreIndex) \
V(LoadStoreRegisterOffset) \
V(LoadStoreUnsignedOffset) \
V(LoadStoreAcquireRelease) \
V(LogicalShifted) \
V(AddSubShifted) \
V(AddSubExtended) \
V(AddSubWithCarry) \
V(ConditionalCompareRegister) \
V(ConditionalCompareImmediate) \
V(ConditionalSelect) \
V(DataProcessing1Source) \
V(DataProcessing2Source) \
V(DataProcessing3Source) \
V(FPCompare) \
V(FPConditionalCompare) \
V(FPConditionalSelect) \
V(FPImmediate) \
V(FPDataProcessing1Source) \
V(FPDataProcessing2Source) \
V(FPDataProcessing3Source) \
V(FPIntegerConvert) \
V(FPFixedPointConvert) \
V(NEON2RegMisc) \
V(NEON3Different) \
V(NEON3Same) \
V(NEONAcrossLanes) \
V(NEONByIndexedElement) \
V(NEONCopy) \
V(NEONExtract) \
V(NEONLoadStoreMultiStruct) \
V(NEONLoadStoreMultiStructPostIndex) \
V(NEONLoadStoreSingleStruct) \
V(NEONLoadStoreSingleStructPostIndex) \
V(NEONModifiedImmediate) \
V(NEONScalar2RegMisc) \
V(NEONScalar3Diff) \
V(NEONScalar3Same) \
V(NEONScalarByIndexedElement) \
V(NEONScalarCopy) \
V(NEONScalarPairwise) \
V(NEONScalarShiftImmediate) \
V(NEONShiftImmediate) \
V(NEONTable) \
V(NEONPerm) \
V(Unallocated) \
V(Unimplemented)
// The Visitor interface. Disassembler and simulator (and other tools)
@@ -109,6 +131,8 @@ class DispatchingDecoderVisitor : public DecoderVisitor {
// stored by the decoder.
void RemoveVisitor(DecoderVisitor* visitor);
void VisitNEONShiftImmediate(const Instruction* instr);
#define DECLARE(A) void Visit##A(Instruction* instr);
VISITOR_LIST(DECLARE)
#undef DECLARE
@@ -173,12 +197,17 @@ class Decoder : public V {
// Decode the Advanced SIMD (NEON) load/store part of the instruction tree,
// and call the corresponding visitors.
// On entry, instruction bits 29:25 = 0x6.
void DecodeAdvSIMDLoadStore(Instruction* instr);
void DecodeNEONLoadStore(Instruction* instr);
// Decode the Advanced SIMD (NEON) data processing part of the instruction
// tree, and call the corresponding visitors.
// On entry, instruction bits 27:25 = 0x7.
void DecodeAdvSIMDDataProcessing(Instruction* instr);
void DecodeNEONVectorDataProcessing(Instruction* instr);
// Decode the Advanced SIMD (NEON) scalar data processing part of the
// instruction tree, and call the corresponding visitors.
// On entry, instruction bits 28:25 = 0xF.
void DecodeNEONScalarDataProcessing(Instruction* instr);
};


@@ -118,13 +118,13 @@ void Deoptimizer::TableEntryGenerator::Generate() {
// Save all allocatable double registers.
CPURegList saved_double_registers(
CPURegister::kFPRegister, kDRegSizeInBits,
CPURegister::kVRegister, kDRegSizeInBits,
RegisterConfiguration::Crankshaft()->allocatable_double_codes_mask());
__ PushCPURegList(saved_double_registers);
// Save all allocatable float registers.
CPURegList saved_float_registers(
CPURegister::kFPRegister, kSRegSizeInBits,
CPURegister::kVRegister, kSRegSizeInBits,
RegisterConfiguration::Crankshaft()->allocatable_float_codes_mask());
__ PushCPURegList(saved_float_registers);

File diff suppressed because it is too large.


@@ -5,6 +5,7 @@
#ifndef V8_ARM64_DISASM_ARM64_H
#define V8_ARM64_DISASM_ARM64_H
#include "src/arm64/assembler-arm64.h"
#include "src/arm64/decoder-arm64.h"
#include "src/arm64/instructions-arm64.h"
#include "src/globals.h"
@@ -29,6 +30,13 @@ class DisassemblingDecoder : public DecoderVisitor {
protected:
virtual void ProcessOutput(Instruction* instr);
// Default output functions. The functions below implement a default way of
// printing elements in the disassembly. A sub-class can override these to
// customize the disassembly output.
// Prints the name of a register.
virtual void AppendRegisterNameToOutput(const CPURegister& reg);
void Format(Instruction* instr, const char* mnemonic, const char* format);
void Substitute(Instruction* instr, const char* string);
int SubstituteField(Instruction* instr, const char* format);


@@ -21,7 +21,7 @@ bool Instruction::IsLoad() const {
if (Mask(LoadStorePairAnyFMask) == LoadStorePairAnyFixed) {
return Mask(LoadStorePairLBit) != 0;
} else {
LoadStoreOp op = static_cast<LoadStoreOp>(Mask(LoadStoreOpMask));
LoadStoreOp op = static_cast<LoadStoreOp>(Mask(LoadStoreMask));
switch (op) {
case LDRB_w:
case LDRH_w:
@@ -32,8 +32,12 @@ bool Instruction::IsLoad() const {
case LDRSH_w:
case LDRSH_x:
case LDRSW_x:
case LDR_b:
case LDR_h:
case LDR_s:
case LDR_d: return true;
case LDR_d:
case LDR_q:
return true;
default: return false;
}
}
@@ -48,14 +52,18 @@ bool Instruction::IsStore() const {
if (Mask(LoadStorePairAnyFMask) == LoadStorePairAnyFixed) {
return Mask(LoadStorePairLBit) == 0;
} else {
LoadStoreOp op = static_cast<LoadStoreOp>(Mask(LoadStoreOpMask));
LoadStoreOp op = static_cast<LoadStoreOp>(Mask(LoadStoreMask));
switch (op) {
case STRB_w:
case STRH_w:
case STR_w:
case STR_x:
case STR_b:
case STR_h:
case STR_s:
case STR_d: return true;
case STR_d:
case STR_q:
return true;
default: return false;
}
}
@@ -138,43 +146,48 @@ uint64_t Instruction::ImmLogical() {
UNREACHABLE();
}
float Instruction::ImmFP32() {
// ImmFP: abcdefgh (8 bits)
// Single: aBbb.bbbc.defg.h000.0000.0000.0000.0000 (32 bits)
// where B is b ^ 1
uint32_t bits = ImmFP();
uint32_t bit7 = (bits >> 7) & 0x1;
uint32_t bit6 = (bits >> 6) & 0x1;
uint32_t bit5_to_0 = bits & 0x3f;
uint32_t result = (bit7 << 31) | ((32 - bit6) << 25) | (bit5_to_0 << 19);
return rawbits_to_float(result);
uint32_t Instruction::ImmNEONabcdefgh() const {
return ImmNEONabc() << 5 | ImmNEONdefgh();
}
float Instruction::ImmFP32() { return Imm8ToFP32(ImmFP()); }
double Instruction::ImmFP64() {
// ImmFP: abcdefgh (8 bits)
// Double: aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000
// 0000.0000.0000.0000.0000.0000.0000.0000 (64 bits)
// where B is b ^ 1
uint32_t bits = ImmFP();
uint64_t bit7 = (bits >> 7) & 0x1;
uint64_t bit6 = (bits >> 6) & 0x1;
uint64_t bit5_to_0 = bits & 0x3f;
uint64_t result = (bit7 << 63) | ((256 - bit6) << 54) | (bit5_to_0 << 48);
return rawbits_to_double(result);
double Instruction::ImmFP64() { return Imm8ToFP64(ImmFP()); }
float Instruction::ImmNEONFP32() const { return Imm8ToFP32(ImmNEONabcdefgh()); }
double Instruction::ImmNEONFP64() const {
return Imm8ToFP64(ImmNEONabcdefgh());
}
unsigned CalcLSDataSize(LoadStoreOp op) {
DCHECK_EQ(static_cast<unsigned>(LSSize_offset + LSSize_width),
kInstructionSize * 8);
unsigned size = static_cast<Instr>(op) >> LSSize_offset;
if ((op & LSVector_mask) != 0) {
// Vector register memory operations encode the access size in the "size"
// and "opc" fields.
if ((size == 0) && ((op & LSOpc_mask) >> LSOpc_offset) >= 2) {
size = kQRegSizeLog2;
}
}
return size;
}
LSDataSize CalcLSPairDataSize(LoadStorePairOp op) {
unsigned CalcLSPairDataSize(LoadStorePairOp op) {
static_assert(kXRegSize == kDRegSize, "X and D registers must be same size.");
static_assert(kWRegSize == kSRegSize, "W and S registers must be same size.");
switch (op) {
case STP_q:
case LDP_q:
return kQRegSizeLog2;
case STP_x:
case LDP_x:
case STP_d:
case LDP_d: return LSDoubleWord;
default: return LSWord;
case LDP_d:
return kXRegSizeLog2;
default:
return kWRegSizeLog2;
}
}
@@ -333,7 +346,405 @@ uint64_t InstructionSequence::InlineData() const {
return payload;
}
VectorFormat VectorFormatHalfWidth(VectorFormat vform) {
DCHECK(vform == kFormat8H || vform == kFormat4S || vform == kFormat2D ||
vform == kFormatH || vform == kFormatS || vform == kFormatD);
switch (vform) {
case kFormat8H:
return kFormat8B;
case kFormat4S:
return kFormat4H;
case kFormat2D:
return kFormat2S;
case kFormatH:
return kFormatB;
case kFormatS:
return kFormatH;
case kFormatD:
return kFormatS;
default:
UNREACHABLE();
}
}
VectorFormat VectorFormatDoubleWidth(VectorFormat vform) {
DCHECK(vform == kFormat8B || vform == kFormat4H || vform == kFormat2S ||
vform == kFormatB || vform == kFormatH || vform == kFormatS);
switch (vform) {
case kFormat8B:
return kFormat8H;
case kFormat4H:
return kFormat4S;
case kFormat2S:
return kFormat2D;
case kFormatB:
return kFormatH;
case kFormatH:
return kFormatS;
case kFormatS:
return kFormatD;
default:
UNREACHABLE();
}
}
VectorFormat VectorFormatFillQ(VectorFormat vform) {
switch (vform) {
case kFormatB:
case kFormat8B:
case kFormat16B:
return kFormat16B;
case kFormatH:
case kFormat4H:
case kFormat8H:
return kFormat8H;
case kFormatS:
case kFormat2S:
case kFormat4S:
return kFormat4S;
case kFormatD:
case kFormat1D:
case kFormat2D:
return kFormat2D;
default:
UNREACHABLE();
}
}
VectorFormat VectorFormatHalfWidthDoubleLanes(VectorFormat vform) {
switch (vform) {
case kFormat4H:
return kFormat8B;
case kFormat8H:
return kFormat16B;
case kFormat2S:
return kFormat4H;
case kFormat4S:
return kFormat8H;
case kFormat1D:
return kFormat2S;
case kFormat2D:
return kFormat4S;
default:
UNREACHABLE();
}
}
VectorFormat VectorFormatDoubleLanes(VectorFormat vform) {
DCHECK(vform == kFormat8B || vform == kFormat4H || vform == kFormat2S);
switch (vform) {
case kFormat8B:
return kFormat16B;
case kFormat4H:
return kFormat8H;
case kFormat2S:
return kFormat4S;
default:
UNREACHABLE();
}
}
VectorFormat VectorFormatHalfLanes(VectorFormat vform) {
DCHECK(vform == kFormat16B || vform == kFormat8H || vform == kFormat4S);
switch (vform) {
case kFormat16B:
return kFormat8B;
case kFormat8H:
return kFormat4H;
case kFormat4S:
return kFormat2S;
default:
UNREACHABLE();
}
}
VectorFormat ScalarFormatFromLaneSize(int laneSize) {
switch (laneSize) {
case 8:
return kFormatB;
case 16:
return kFormatH;
case 32:
return kFormatS;
case 64:
return kFormatD;
default:
UNREACHABLE();
}
}
VectorFormat ScalarFormatFromFormat(VectorFormat vform) {
return ScalarFormatFromLaneSize(LaneSizeInBitsFromFormat(vform));
}
unsigned RegisterSizeInBytesFromFormat(VectorFormat vform) {
return RegisterSizeInBitsFromFormat(vform) / 8;
}
unsigned RegisterSizeInBitsFromFormat(VectorFormat vform) {
DCHECK_NE(vform, kFormatUndefined);
switch (vform) {
case kFormatB:
return kBRegSizeInBits;
case kFormatH:
return kHRegSizeInBits;
case kFormatS:
return kSRegSizeInBits;
case kFormatD:
return kDRegSizeInBits;
case kFormat8B:
case kFormat4H:
case kFormat2S:
case kFormat1D:
return kDRegSizeInBits;
default:
return kQRegSizeInBits;
}
}
unsigned LaneSizeInBitsFromFormat(VectorFormat vform) {
DCHECK_NE(vform, kFormatUndefined);
switch (vform) {
case kFormatB:
case kFormat8B:
case kFormat16B:
return 8;
case kFormatH:
case kFormat4H:
case kFormat8H:
return 16;
case kFormatS:
case kFormat2S:
case kFormat4S:
return 32;
case kFormatD:
case kFormat1D:
case kFormat2D:
return 64;
default:
UNREACHABLE();
}
}
int LaneSizeInBytesFromFormat(VectorFormat vform) {
return LaneSizeInBitsFromFormat(vform) / 8;
}
int LaneSizeInBytesLog2FromFormat(VectorFormat vform) {
DCHECK_NE(vform, kFormatUndefined);
switch (vform) {
case kFormatB:
case kFormat8B:
case kFormat16B:
return 0;
case kFormatH:
case kFormat4H:
case kFormat8H:
return 1;
case kFormatS:
case kFormat2S:
case kFormat4S:
return 2;
case kFormatD:
case kFormat1D:
case kFormat2D:
return 3;
default:
UNREACHABLE();
}
}
int LaneCountFromFormat(VectorFormat vform) {
DCHECK_NE(vform, kFormatUndefined);
switch (vform) {
case kFormat16B:
return 16;
case kFormat8B:
case kFormat8H:
return 8;
case kFormat4H:
case kFormat4S:
return 4;
case kFormat2S:
case kFormat2D:
return 2;
case kFormat1D:
case kFormatB:
case kFormatH:
case kFormatS:
case kFormatD:
return 1;
default:
UNREACHABLE();
}
}
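
// Illustration (not part of the patch): the tables above are mutually
// consistent; lane count times lane width equals the register width.
// Checked by hand for kFormat4H (four 16-bit lanes in a 64-bit D register):
#include <cassert>
int main() {
  int lanes = 4, lane_bits = 16, reg_bits = 64;  // values from the switches
  assert(lanes * lane_bits == reg_bits);
  return 0;
}
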
int MaxLaneCountFromFormat(VectorFormat vform) {
DCHECK_NE(vform, kFormatUndefined);
switch (vform) {
case kFormatB:
case kFormat8B:
case kFormat16B:
return 16;
case kFormatH:
case kFormat4H:
case kFormat8H:
return 8;
case kFormatS:
case kFormat2S:
case kFormat4S:
return 4;
case kFormatD:
case kFormat1D:
case kFormat2D:
return 2;
default:
UNREACHABLE();
}
}
// Does 'vform' indicate a vector format or a scalar format?
bool IsVectorFormat(VectorFormat vform) {
DCHECK_NE(vform, kFormatUndefined);
switch (vform) {
case kFormatB:
case kFormatH:
case kFormatS:
case kFormatD:
return false;
default:
return true;
}
}
int64_t MaxIntFromFormat(VectorFormat vform) {
return INT64_MAX >> (64 - LaneSizeInBitsFromFormat(vform));
}
int64_t MinIntFromFormat(VectorFormat vform) {
return INT64_MIN >> (64 - LaneSizeInBitsFromFormat(vform));
}
uint64_t MaxUintFromFormat(VectorFormat vform) {
return UINT64_MAX >> (64 - LaneSizeInBitsFromFormat(vform));
}
NEONFormatDecoder::NEONFormatDecoder(const Instruction* instr) {
instrbits_ = instr->InstructionBits();
SetFormatMaps(IntegerFormatMap());
}
NEONFormatDecoder::NEONFormatDecoder(const Instruction* instr,
const NEONFormatMap* format) {
instrbits_ = instr->InstructionBits();
SetFormatMaps(format);
}
NEONFormatDecoder::NEONFormatDecoder(const Instruction* instr,
const NEONFormatMap* format0,
const NEONFormatMap* format1) {
instrbits_ = instr->InstructionBits();
SetFormatMaps(format0, format1);
}
NEONFormatDecoder::NEONFormatDecoder(const Instruction* instr,
const NEONFormatMap* format0,
const NEONFormatMap* format1,
const NEONFormatMap* format2) {
instrbits_ = instr->InstructionBits();
SetFormatMaps(format0, format1, format2);
}
void NEONFormatDecoder::SetFormatMaps(const NEONFormatMap* format0,
const NEONFormatMap* format1,
const NEONFormatMap* format2) {
DCHECK_NOT_NULL(format0);
formats_[0] = format0;
formats_[1] = (format1 == NULL) ? formats_[0] : format1;
formats_[2] = (format2 == NULL) ? formats_[1] : format2;
}
void NEONFormatDecoder::SetFormatMap(unsigned index,
const NEONFormatMap* format) {
DCHECK_LT(index, arraysize(formats_));
DCHECK_NOT_NULL(format);
formats_[index] = format;
}
const char* NEONFormatDecoder::SubstitutePlaceholders(const char* string) {
return Substitute(string, kPlaceholder, kPlaceholder, kPlaceholder);
}
const char* NEONFormatDecoder::Substitute(const char* string,
SubstitutionMode mode0,
SubstitutionMode mode1,
SubstitutionMode mode2) {
snprintf(form_buffer_, sizeof(form_buffer_), string, GetSubstitute(0, mode0),
GetSubstitute(1, mode1), GetSubstitute(2, mode2));
return form_buffer_;
}
const char* NEONFormatDecoder::Mnemonic(const char* mnemonic) {
if ((instrbits_ & NEON_Q) != 0) {
snprintf(mne_buffer_, sizeof(mne_buffer_), "%s2", mnemonic);
return mne_buffer_;
}
return mnemonic;
}
VectorFormat NEONFormatDecoder::GetVectorFormat(int format_index) {
return GetVectorFormat(formats_[format_index]);
}
VectorFormat NEONFormatDecoder::GetVectorFormat(
const NEONFormatMap* format_map) {
static const VectorFormat vform[] = {
kFormatUndefined, kFormat8B, kFormat16B, kFormat4H, kFormat8H,
kFormat2S, kFormat4S, kFormat1D, kFormat2D, kFormatB,
kFormatH, kFormatS, kFormatD};
DCHECK_LT(GetNEONFormat(format_map), arraysize(vform));
return vform[GetNEONFormat(format_map)];
}
const char* NEONFormatDecoder::GetSubstitute(int index, SubstitutionMode mode) {
if (mode == kFormat) {
return NEONFormatAsString(GetNEONFormat(formats_[index]));
}
DCHECK_EQ(mode, kPlaceholder);
return NEONFormatAsPlaceholder(GetNEONFormat(formats_[index]));
}
NEONFormat NEONFormatDecoder::GetNEONFormat(const NEONFormatMap* format_map) {
return format_map->map[PickBits(format_map->bits)];
}
const char* NEONFormatDecoder::NEONFormatAsString(NEONFormat format) {
static const char* formats[] = {"undefined", "8b", "16b", "4h", "8h",
"2s", "4s", "1d", "2d", "b",
"h", "s", "d"};
DCHECK_LT(format, arraysize(formats));
return formats[format];
}
const char* NEONFormatDecoder::NEONFormatAsPlaceholder(NEONFormat format) {
DCHECK((format == NF_B) || (format == NF_H) || (format == NF_S) ||
(format == NF_D) || (format == NF_UNDEF));
static const char* formats[] = {
"undefined", "undefined", "undefined", "undefined", "undefined",
"undefined", "undefined", "undefined", "undefined", "'B",
"'H", "'S", "'D"};
return formats[format];
}
uint8_t NEONFormatDecoder::PickBits(const uint8_t bits[]) {
uint8_t result = 0;
for (unsigned b = 0; b < kNEONFormatMaxBits; b++) {
if (bits[b] == 0) break;
result <<= 1;
result |= ((instrbits_ & (1 << bits[b])) == 0) ? 0 : 1;
}
return result;
}
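
// Illustration (not part of the patch): PickBits concatenates the named
// instruction bits, most significant first, stopping at a zero entry.
// Standalone copy of the loop, fed the IntegerFormatMap positions:
#include <cassert>
#include <cstdint>
uint8_t ExamplePickBits(const uint8_t bits[], unsigned n, uint32_t instr) {
  uint8_t result = 0;
  for (unsigned b = 0; b < n; b++) {
    if (bits[b] == 0) break;
    result <<= 1;
    result |= ((instr & (1u << bits[b])) == 0) ? 0 : 1;
  }
  return result;
}
int main() {
  const uint8_t bits[6] = {23, 22, 30, 0, 0, 0};  // size<1:0>, then Q
  uint32_t instr = (1u << 22) | (1u << 30);       // size = 01, Q = 1
  assert(ExamplePickBits(bits, 6, instr) == 3);   // index 3 -> NF_8H
  return 0;
}
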
} // namespace internal
} // namespace v8


@@ -23,13 +23,17 @@ typedef uint32_t Instr;
// symbol is defined as uint32_t/uint64_t initialized with the desired bit
// pattern. Otherwise, the same symbol is declared as an external float/double.
#if defined(ARM64_DEFINE_FP_STATICS)
#define DEFINE_FLOAT16(name, value) extern const uint16_t name = value
#define DEFINE_FLOAT(name, value) extern const uint32_t name = value
#define DEFINE_DOUBLE(name, value) extern const uint64_t name = value
#else
#define DEFINE_FLOAT16(name, value) extern const float16 name
#define DEFINE_FLOAT(name, value) extern const float name
#define DEFINE_DOUBLE(name, value) extern const double name
#endif // defined(ARM64_DEFINE_FP_STATICS)
DEFINE_FLOAT16(kFP16PositiveInfinity, 0x7c00);
DEFINE_FLOAT16(kFP16NegativeInfinity, 0xfc00);
DEFINE_FLOAT(kFP32PositiveInfinity, 0x7f800000);
DEFINE_FLOAT(kFP32NegativeInfinity, 0xff800000);
DEFINE_DOUBLE(kFP64PositiveInfinity, 0x7ff0000000000000UL);
@@ -47,19 +51,14 @@ DEFINE_FLOAT(kFP32QuietNaN, 0x7fc00001);
// The default NaN values (for FPCR.DN=1).
DEFINE_DOUBLE(kFP64DefaultNaN, 0x7ff8000000000000UL);
DEFINE_FLOAT(kFP32DefaultNaN, 0x7fc00000);
DEFINE_FLOAT16(kFP16DefaultNaN, 0x7e00);
#undef DEFINE_FLOAT16
#undef DEFINE_FLOAT
#undef DEFINE_DOUBLE
enum LSDataSize {
LSByte = 0,
LSHalfword = 1,
LSWord = 2,
LSDoubleWord = 3
};
LSDataSize CalcLSPairDataSize(LoadStorePairOp op);
unsigned CalcLSDataSize(LoadStoreOp op);
unsigned CalcLSPairDataSize(LoadStorePairOp op);
enum ImmBranchType {
UnknownBranchType = 0,
@@ -82,9 +81,10 @@ enum FPRounding {
FPNegativeInfinity = 0x2,
FPZero = 0x3,
// The final rounding mode is only available when explicitly specified by the
// instruction (such as with fcvta). It cannot be set in FPCR.
FPTieAway
// The final rounding modes are only available when explicitly specified by
// the instruction (such as with fcvta). They cannot be set in FPCR.
FPTieAway,
FPRoundOdd
};
enum Reg31Mode {
@@ -152,14 +152,29 @@ class Instruction {
}
uint64_t ImmLogical();
unsigned ImmNEONabcdefgh() const;
float ImmFP32();
double ImmFP64();
float ImmNEONFP32() const;
double ImmNEONFP64() const;
LSDataSize SizeLSPair() const {
unsigned SizeLS() const {
return CalcLSDataSize(static_cast<LoadStoreOp>(Mask(LoadStoreMask)));
}
unsigned SizeLSPair() const {
return CalcLSPairDataSize(
static_cast<LoadStorePairOp>(Mask(LoadStorePairMask)));
}
int NEONLSIndex(int access_size_shift) const {
int q = NEONQ();
int s = NEONS();
int size = NEONLSSize();
int index = (q << 3) | (s << 2) | size;
return index >> access_size_shift;
}
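
// Illustration (not part of the patch): for single-structure loads/stores
// the lane index is the concatenation q:s:size, right-shifted by the access
// size. Standalone arithmetic with assumed field values:
#include <cassert>
int main() {
  int q = 1, s = 0, size = 3;
  int index = (q << 3) | (s << 2) | size;  // binary 1011 = 11
  assert((index >> 0) == 11);  // byte lanes use all four bits: lanes 0..15
  assert((index >> 2) == 2);   // 32-bit lanes keep only q:s
  return 0;
}
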
// Helpers.
bool IsCondBranchImm() const {
return Mask(ConditionalBranchFMask) == ConditionalBranchFixed;
@@ -181,6 +196,33 @@ class Instruction {
return BranchType() != UnknownBranchType;
}
static float Imm8ToFP32(uint32_t imm8) {
// Imm8: abcdefgh (8 bits)
// Single: aBbb.bbbc.defg.h000.0000.0000.0000.0000 (32 bits)
// where B is b ^ 1
uint32_t bits = imm8;
uint32_t bit7 = (bits >> 7) & 0x1;
uint32_t bit6 = (bits >> 6) & 0x1;
uint32_t bit5_to_0 = bits & 0x3f;
uint32_t result = (bit7 << 31) | ((32 - bit6) << 25) | (bit5_to_0 << 19);
return bit_cast<float>(result);
}
static double Imm8ToFP64(uint32_t imm8) {
// Imm8: abcdefgh (8 bits)
// Double: aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000
// 0000.0000.0000.0000.0000.0000.0000.0000 (64 bits)
// where B is b ^ 1
uint32_t bits = imm8;
uint64_t bit7 = (bits >> 7) & 0x1;
uint64_t bit6 = (bits >> 6) & 0x1;
uint64_t bit5_to_0 = bits & 0x3f;
uint64_t result = (bit7 << 63) | ((256 - bit6) << 54) | (bit5_to_0 << 48);
return bit_cast<double>(result);
}
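
// Illustration (not part of the patch): checking the expansion above for one
// value. FMOV Sd, #1.0 uses imm8 = 0x70 (a=0, b=1, cdefgh=110000):
#include <cassert>
#include <cstdint>
#include <cstring>
float ExampleImm8ToFP32(uint32_t imm8) {
  uint32_t bit7 = (imm8 >> 7) & 0x1;
  uint32_t bit6 = (imm8 >> 6) & 0x1;
  uint32_t bit5_to_0 = imm8 & 0x3f;
  uint32_t result = (bit7 << 31) | ((32 - bit6) << 25) | (bit5_to_0 << 19);
  float f;
  std::memcpy(&f, &result, sizeof(f));  // stand-in for bit_cast<float>
  return f;
}
int main() {
  assert(ExampleImm8ToFP32(0x70) == 1.0f);   // bit pattern 0x3f800000
  assert(ExampleImm8ToFP32(0xf0) == -1.0f);  // sign comes from bit 'a'
  return 0;
}
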
bool IsLdrLiteral() const {
return Mask(LoadLiteralFMask) == LoadLiteralFixed;
}
@@ -416,6 +458,48 @@ class Instruction {
void SetBranchImmTarget(Instruction* target);
};
// Functions for handling NEON vector format information.
enum VectorFormat {
kFormatUndefined = 0xffffffff,
kFormat8B = NEON_8B,
kFormat16B = NEON_16B,
kFormat4H = NEON_4H,
kFormat8H = NEON_8H,
kFormat2S = NEON_2S,
kFormat4S = NEON_4S,
kFormat1D = NEON_1D,
kFormat2D = NEON_2D,
// Scalar formats. We add the scalar bit to distinguish between scalar and
// vector enumerations; the bit is always set in the encoding of scalar ops
// and always clear for vector ops. Although kFormatD and kFormat1D appear
// to be the same, their meaning is subtly different. The first is a scalar
// operation, the second a vector operation that only affects one lane.
kFormatB = NEON_B | NEONScalar,
kFormatH = NEON_H | NEONScalar,
kFormatS = NEON_S | NEONScalar,
kFormatD = NEON_D | NEONScalar
};
VectorFormat VectorFormatHalfWidth(VectorFormat vform);
VectorFormat VectorFormatDoubleWidth(VectorFormat vform);
VectorFormat VectorFormatDoubleLanes(VectorFormat vform);
VectorFormat VectorFormatHalfLanes(VectorFormat vform);
VectorFormat ScalarFormatFromLaneSize(int lanesize);
VectorFormat VectorFormatHalfWidthDoubleLanes(VectorFormat vform);
VectorFormat VectorFormatFillQ(VectorFormat vform);
VectorFormat ScalarFormatFromFormat(VectorFormat vform);
unsigned RegisterSizeInBitsFromFormat(VectorFormat vform);
unsigned RegisterSizeInBytesFromFormat(VectorFormat vform);
int LaneSizeInBytesFromFormat(VectorFormat vform);
unsigned LaneSizeInBitsFromFormat(VectorFormat vform);
int LaneSizeInBytesLog2FromFormat(VectorFormat vform);
int LaneCountFromFormat(VectorFormat vform);
int MaxLaneCountFromFormat(VectorFormat vform);
bool IsVectorFormat(VectorFormat vform);
int64_t MaxIntFromFormat(VectorFormat vform);
int64_t MinIntFromFormat(VectorFormat vform);
uint64_t MaxUintFromFormat(VectorFormat vform);
// Where Instruction looks at instructions generated by the Assembler,
// InstructionSequence looks at instructions sequences generated by the
@@ -503,7 +587,7 @@ const unsigned kDebugMessageOffset = 3 * kInstructionSize;
//
// For example:
//
// __ debug("print registers and fp registers", 0, LOG_REGS | LOG_FP_REGS);
// __ debug("print registers and fp registers", 0, LOG_REGS | LOG_VREGS);
// will print the registers and fp registers only once.
//
// __ debug("trace disasm", 1, TRACE_ENABLE | LOG_DISASM);
@@ -516,24 +600,201 @@ const unsigned kDebuggerTracingDirectivesMask = 3 << 6;
// stops tracing the registers.
const unsigned kDebuggerTracingDirectivesMask = 3 << 6;
enum DebugParameters {
NO_PARAM = 0,
BREAK = 1 << 0,
LOG_DISASM = 1 << 1, // Use only with TRACE. Disassemble the code.
LOG_REGS = 1 << 2, // Log general purpose registers.
LOG_FP_REGS = 1 << 3, // Log floating-point registers.
LOG_SYS_REGS = 1 << 4, // Log the status flags.
LOG_WRITE = 1 << 5, // Log any memory write.
NO_PARAM = 0,
BREAK = 1 << 0,
LOG_DISASM = 1 << 1, // Use only with TRACE. Disassemble the code.
LOG_REGS = 1 << 2, // Log general purpose registers.
LOG_VREGS = 1 << 3, // Log NEON and floating-point registers.
LOG_SYS_REGS = 1 << 4, // Log the status flags.
LOG_WRITE = 1 << 5, // Log any memory write.
LOG_STATE = LOG_REGS | LOG_FP_REGS | LOG_SYS_REGS,
LOG_ALL = LOG_DISASM | LOG_STATE | LOG_WRITE,
LOG_NONE = 0,
LOG_STATE = LOG_REGS | LOG_VREGS | LOG_SYS_REGS,
LOG_ALL = LOG_DISASM | LOG_STATE | LOG_WRITE,
// Trace control.
TRACE_ENABLE = 1 << 6,
TRACE_DISABLE = 2 << 6,
TRACE_ENABLE = 1 << 6,
TRACE_DISABLE = 2 << 6,
TRACE_OVERRIDE = 3 << 6
};
enum NEONFormat {
NF_UNDEF = 0,
NF_8B = 1,
NF_16B = 2,
NF_4H = 3,
NF_8H = 4,
NF_2S = 5,
NF_4S = 6,
NF_1D = 7,
NF_2D = 8,
NF_B = 9,
NF_H = 10,
NF_S = 11,
NF_D = 12
};
static const unsigned kNEONFormatMaxBits = 6;
struct NEONFormatMap {
// The bit positions in the instruction to consider.
uint8_t bits[kNEONFormatMaxBits];
// Mapping from concatenated bits to format.
NEONFormat map[1 << kNEONFormatMaxBits];
};
class NEONFormatDecoder {
public:
enum SubstitutionMode { kPlaceholder, kFormat };
// Construct a format decoder with increasingly specific format maps for each
// substitution. If no format map is specified, the default is the integer
// format map.
explicit NEONFormatDecoder(const Instruction* instr);
NEONFormatDecoder(const Instruction* instr, const NEONFormatMap* format);
NEONFormatDecoder(const Instruction* instr, const NEONFormatMap* format0,
const NEONFormatMap* format1);
NEONFormatDecoder(const Instruction* instr, const NEONFormatMap* format0,
const NEONFormatMap* format1, const NEONFormatMap* format2);
// Set the format mapping for all or individual substitutions.
void SetFormatMaps(const NEONFormatMap* format0,
const NEONFormatMap* format1 = NULL,
const NEONFormatMap* format2 = NULL);
void SetFormatMap(unsigned index, const NEONFormatMap* format);
// Substitute %s in the input string with the placeholder string for each
// register, i.e. "'B", "'H", etc.
const char* SubstitutePlaceholders(const char* string);
// Substitute %s in the input string with a new string based on the
// substitution mode.
const char* Substitute(const char* string, SubstitutionMode mode0 = kFormat,
SubstitutionMode mode1 = kFormat,
SubstitutionMode mode2 = kFormat);
// Append a "2" to a mnemonic string based of the state of the Q bit.
const char* Mnemonic(const char* mnemonic);
VectorFormat GetVectorFormat(int format_index = 0);
VectorFormat GetVectorFormat(const NEONFormatMap* format_map);
// Built in mappings for common cases.
// The integer format map uses three bits (Q, size<1:0>) to encode the
// "standard" set of NEON integer vector formats.
static const NEONFormatMap* IntegerFormatMap() {
static const NEONFormatMap map = {
{23, 22, 30},
{NF_8B, NF_16B, NF_4H, NF_8H, NF_2S, NF_4S, NF_UNDEF, NF_2D}};
return &map;
}
// The long integer format map uses two bits (size<1:0>) to encode the
// long set of NEON integer vector formats. These are used in narrow, wide
// and long operations.
static const NEONFormatMap* LongIntegerFormatMap() {
static const NEONFormatMap map = {{23, 22}, {NF_8H, NF_4S, NF_2D}};
return &map;
}
// The FP format map uses two bits (Q, size<0>) to encode the NEON FP vector
// formats: NF_2S, NF_4S, NF_2D.
static const NEONFormatMap* FPFormatMap() {
// The FP format map assumes two bits (Q, size<0>) are used to encode the
// NEON FP vector formats: NF_2S, NF_4S, NF_2D.
static const NEONFormatMap map = {{22, 30},
{NF_2S, NF_4S, NF_UNDEF, NF_2D}};
return &map;
}
// The load/store format map uses three bits (Q, 11, 10) to encode the
// set of NEON vector formats.
static const NEONFormatMap* LoadStoreFormatMap() {
static const NEONFormatMap map = {
{11, 10, 30},
{NF_8B, NF_16B, NF_4H, NF_8H, NF_2S, NF_4S, NF_1D, NF_2D}};
return &map;
}
// The logical format map uses one bit (Q) to encode the NEON vector format:
// NF_8B, NF_16B.
static const NEONFormatMap* LogicalFormatMap() {
static const NEONFormatMap map = {{30}, {NF_8B, NF_16B}};
return &map;
}
// The triangular format map uses between two and five bits to encode the NEON
// vector format:
// xxx10->8B, xxx11->16B, xx100->4H, xx101->8H
// x1000->2S, x1001->4S, 10001->2D, all others undefined.
static const NEONFormatMap* TriangularFormatMap() {
static const NEONFormatMap map = {
{19, 18, 17, 16, 30},
{NF_UNDEF, NF_UNDEF, NF_8B, NF_16B, NF_4H, NF_8H, NF_8B, NF_16B,
NF_2S, NF_4S, NF_8B, NF_16B, NF_4H, NF_8H, NF_8B, NF_16B,
NF_UNDEF, NF_2D, NF_8B, NF_16B, NF_4H, NF_8H, NF_8B, NF_16B,
NF_2S, NF_4S, NF_8B, NF_16B, NF_4H, NF_8H, NF_8B, NF_16B}};
return &map;
}
// The scalar format map uses two bits (size<1:0>) to encode the NEON scalar
// formats: NF_B, NF_H, NF_S, NF_D.
static const NEONFormatMap* ScalarFormatMap() {
static const NEONFormatMap map = {{23, 22}, {NF_B, NF_H, NF_S, NF_D}};
return &map;
}
// The long scalar format map uses two bits (size<1:0>) to encode the longer
// NEON scalar formats: NF_H, NF_S, NF_D.
static const NEONFormatMap* LongScalarFormatMap() {
static const NEONFormatMap map = {{23, 22}, {NF_H, NF_S, NF_D}};
return &map;
}
// The FP scalar format map assumes one bit (size<0>) is used to encode the
// NEON FP scalar formats: NF_S, NF_D.
static const NEONFormatMap* FPScalarFormatMap() {
static const NEONFormatMap map = {{22}, {NF_S, NF_D}};
return &map;
}
// The triangular scalar format map uses between one and four bits to encode
// the NEON FP scalar formats:
// xxx1->B, xx10->H, x100->S, 1000->D, all others undefined.
static const NEONFormatMap* TriangularScalarFormatMap() {
static const NEONFormatMap map = {
{19, 18, 17, 16},
{NF_UNDEF, NF_B, NF_H, NF_B, NF_S, NF_B, NF_H, NF_B, NF_D, NF_B, NF_H,
NF_B, NF_S, NF_B, NF_H, NF_B}};
return &map;
}
private:
// Get a pointer to a string that represents the format or placeholder for
// the specified substitution index, based on the format map and instruction.
const char* GetSubstitute(int index, SubstitutionMode mode);
// Get the NEONFormat enumerated value for bits obtained from the
// instruction based on the specified format mapping.
NEONFormat GetNEONFormat(const NEONFormatMap* format_map);
// Convert a NEONFormat into a string.
static const char* NEONFormatAsString(NEONFormat format);
// Convert a NEONFormat into a register placeholder string.
static const char* NEONFormatAsPlaceholder(NEONFormat format);
// Select bits from instrbits_ defined by the bits array, concatenate them,
// and return the value.
uint8_t PickBits(const uint8_t bits[]);
Instr instrbits_;
const NEONFormatMap* formats_[3];
char form_buffer_[64];
char mne_buffer_[16];
};
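
// Illustration (not part of the patch): the triangular scalar map above
// encodes the size in the position of the lowest set bit of instruction
// bits 19:16, exactly as its comment states. Standalone sketch:
#include <cassert>
char ExampleTriangularScalar(unsigned bits19_16) {
  if (bits19_16 & 1) return 'B';  // xxx1
  if (bits19_16 & 2) return 'H';  // xx10
  if (bits19_16 & 4) return 'S';  // x100
  if (bits19_16 & 8) return 'D';  // 1000
  return '?';                     // 0000 is undefined
}
int main() {
  assert(ExampleTriangularScalar(0x5) == 'B');  // 0101
  assert(ExampleTriangularScalar(0x6) == 'H');  // 0110
  assert(ExampleTriangularScalar(0x8) == 'D');  // 1000
  return 0;
}
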
} // namespace internal
} // namespace v8


@@ -377,7 +377,7 @@ void Instrument::InstrumentLoadStore(Instruction* instr) {
static Counter* load_fp_counter = GetCounter("Load FP");
static Counter* store_fp_counter = GetCounter("Store FP");
switch (instr->Mask(LoadStoreOpMask)) {
switch (instr->Mask(LoadStoreMask)) {
case STRB_w: // Fall through.
case STRH_w: // Fall through.
case STR_w: // Fall through.
@@ -595,6 +595,159 @@ void Instrument::VisitFPFixedPointConvert(Instruction* instr) {
counter->Increment();
}
void Instrument::VisitNEON2RegMisc(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEON3Different(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEON3Same(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONAcrossLanes(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONByIndexedElement(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONCopy(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONExtract(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONLoadStoreMultiStruct(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONLoadStoreMultiStructPostIndex(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONLoadStoreSingleStruct(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONLoadStoreSingleStructPostIndex(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONModifiedImmediate(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONPerm(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalar2RegMisc(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalar3Diff(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalar3Same(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalarByIndexedElement(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalarCopy(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalarPairwise(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONScalarShiftImmediate(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONShiftImmediate(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitNEONTable(Instruction* instr) {
USE(instr);
Update();
static Counter* counter = GetCounter("NEON");
counter->Increment();
}
void Instrument::VisitUnallocated(Instruction* instr) {
Update();


@@ -547,42 +547,34 @@ void MacroAssembler::Extr(const Register& rd,
extr(rd, rn, rm, lsb);
}
void MacroAssembler::Fabs(const FPRegister& fd, const FPRegister& fn) {
void MacroAssembler::Fabs(const VRegister& fd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
fabs(fd, fn);
}
void MacroAssembler::Fadd(const FPRegister& fd,
const FPRegister& fn,
const FPRegister& fm) {
void MacroAssembler::Fadd(const VRegister& fd, const VRegister& fn,
const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fadd(fd, fn, fm);
}
void MacroAssembler::Fccmp(const FPRegister& fn,
const FPRegister& fm,
StatusFlags nzcv,
Condition cond) {
void MacroAssembler::Fccmp(const VRegister& fn, const VRegister& fm,
StatusFlags nzcv, Condition cond) {
DCHECK(allow_macro_instructions_);
DCHECK((cond != al) && (cond != nv));
fccmp(fn, fm, nzcv, cond);
}
void MacroAssembler::Fcmp(const FPRegister& fn, const FPRegister& fm) {
void MacroAssembler::Fcmp(const VRegister& fn, const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fcmp(fn, fm);
}
void MacroAssembler::Fcmp(const FPRegister& fn, double value) {
void MacroAssembler::Fcmp(const VRegister& fn, double value) {
DCHECK(allow_macro_instructions_);
if (value != 0.0) {
UseScratchRegisterScope temps(this);
FPRegister tmp = temps.AcquireSameSizeAs(fn);
VRegister tmp = temps.AcquireSameSizeAs(fn);
Fmov(tmp, value);
fcmp(fn, tmp);
} else {
@@ -590,271 +582,204 @@ void MacroAssembler::Fcmp(const FPRegister& fn, double value) {
}
}
void MacroAssembler::Fcsel(const FPRegister& fd,
const FPRegister& fn,
const FPRegister& fm,
Condition cond) {
void MacroAssembler::Fcsel(const VRegister& fd, const VRegister& fn,
const VRegister& fm, Condition cond) {
DCHECK(allow_macro_instructions_);
DCHECK((cond != al) && (cond != nv));
fcsel(fd, fn, fm, cond);
}
void MacroAssembler::Fcvt(const FPRegister& fd, const FPRegister& fn) {
void MacroAssembler::Fcvt(const VRegister& fd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
fcvt(fd, fn);
}
void MacroAssembler::Fcvtas(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtas(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtas(rd, fn);
}
void MacroAssembler::Fcvtau(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtau(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtau(rd, fn);
}
void MacroAssembler::Fcvtms(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtms(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtms(rd, fn);
}
void MacroAssembler::Fcvtmu(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtmu(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtmu(rd, fn);
}
void MacroAssembler::Fcvtns(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtns(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtns(rd, fn);
}
void MacroAssembler::Fcvtnu(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtnu(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtnu(rd, fn);
}
void MacroAssembler::Fcvtzs(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtzs(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtzs(rd, fn);
}
void MacroAssembler::Fcvtzu(const Register& rd, const FPRegister& fn) {
void MacroAssembler::Fcvtzu(const Register& rd, const VRegister& fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fcvtzu(rd, fn);
}
void MacroAssembler::Fdiv(const VRegister& fd, const VRegister& fn,
                          const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fdiv(fd, fn, fm);
}
void MacroAssembler::Fmadd(const VRegister& fd, const VRegister& fn,
                           const VRegister& fm, const VRegister& fa) {
DCHECK(allow_macro_instructions_);
fmadd(fd, fn, fm, fa);
}
void MacroAssembler::Fmax(const VRegister& fd, const VRegister& fn,
                          const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fmax(fd, fn, fm);
}
void MacroAssembler::Fmaxnm(const VRegister& fd, const VRegister& fn,
                            const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fmaxnm(fd, fn, fm);
}
void MacroAssembler::Fmin(const VRegister& fd, const VRegister& fn,
                          const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fmin(fd, fn, fm);
}
void MacroAssembler::Fminnm(const VRegister& fd, const VRegister& fn,
                            const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fminnm(fd, fn, fm);
}
void MacroAssembler::Fmov(VRegister fd, VRegister fn) {
DCHECK(allow_macro_instructions_);
// Only emit an instruction if fd and fn are different, and they are both D
// registers. fmov(s0, s0) is not a no-op because it clears the top word of
// d0. Technically, fmov(d0, d0) is not a no-op either because it clears the
  // top of q0, but VRegister does not currently support Q registers.
if (!fd.Is(fn) || !fd.Is64Bits()) {
fmov(fd, fn);
}
}
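
// Illustrative sketch of the elision rule above (register choices are
// hypothetical, not taken from this patch):
//   Fmov(d0, d0);  // elided: same register, 64 bits wide, no visible effect.
//   Fmov(s0, s0);  // emitted: writes s0 and zeroes bits [63:32] of d0.
//   Fmov(d1, d2);  // emitted: different registers.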
void MacroAssembler::Fmov(VRegister fd, Register rn) {
DCHECK(allow_macro_instructions_);
fmov(fd, rn);
}
void MacroAssembler::Fmov(VRegister vd, double imm) {
  DCHECK(allow_macro_instructions_);

  if (vd.Is1S() || vd.Is2S() || vd.Is4S()) {
    Fmov(vd, static_cast<float>(imm));
    return;
  }

  DCHECK(vd.Is1D() || vd.Is2D());
  if (IsImmFP64(imm)) {
    fmov(vd, imm);
  } else {
    uint64_t bits = bit_cast<uint64_t>(imm);
    if (vd.IsScalar()) {
      if (bits == 0) {
        fmov(vd, xzr);
      } else {
        Ldr(vd, imm);
      }
    } else {
      // TODO(all): consider NEON support for load literal.
      Movi(vd, bits);
    }
  }
}
void MacroAssembler::Fmov(VRegister vd, float imm) {
  DCHECK(allow_macro_instructions_);
  if (vd.Is1D() || vd.Is2D()) {
    Fmov(vd, static_cast<double>(imm));
    return;
  }

  DCHECK(vd.Is1S() || vd.Is2S() || vd.Is4S());
  if (IsImmFP32(imm)) {
    fmov(vd, imm);
  } else {
    uint32_t bits = bit_cast<uint32_t>(imm);
    if (vd.IsScalar()) {
      if (bits == 0) {
        fmov(vd, wzr);
      } else {
        UseScratchRegisterScope temps(this);
        Register tmp = temps.AcquireW();
        // TODO(all): Use Assembler::ldr(const VRegister& ft, float imm).
        Mov(tmp, bit_cast<uint32_t>(imm));
        Fmov(vd, tmp);
      }
    } else {
      // TODO(all): consider NEON support for load literal.
      Movi(vd, bits);
    }
  }
}
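
The IsImmFP32/IsImmFP64 fast path above only applies to values representable in the FMOV 8-bit immediate field. A standalone, hedged sketch of that expansion follows; the encoding (1 sign, 3 exponent, 4 mantissa bits) is assumed from the AArch64 architecture, not taken from this patch:

#include <cmath>
#include <cstdio>

// imm8 = abcdefgh encodes (-1)^a * (1 + efgh/16) * 2^(UInt(NOT(b):cd) - 3),
// i.e. +/-(16..31)/16 scaled by 2^-3..2^4; anything else needs Ldr/Movi.
double DecodeFPImm8(unsigned imm8) {
  int a = (imm8 >> 7) & 1;
  int b = (imm8 >> 6) & 1;
  int cd = (imm8 >> 4) & 3;
  int efgh = imm8 & 15;
  int exp = (((1 - b) << 2) | cd) - 3;  // NOT(b):cd, rebiased.
  double mant = 1.0 + efgh / 16.0;
  return std::ldexp(a ? -mant : mant, exp);
}

int main() {
  // imm8 0x70 encodes 1.0; imm8 0x00 encodes 2.0.
  std::printf("%g %g\n", DecodeFPImm8(0x70), DecodeFPImm8(0x00));
  return 0;
}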
void MacroAssembler::Fmov(Register rd, VRegister fn) {
DCHECK(allow_macro_instructions_);
DCHECK(!rd.IsZero());
fmov(rd, fn);
}
void MacroAssembler::Fmsub(const VRegister& fd, const VRegister& fn,
                           const VRegister& fm, const VRegister& fa) {
DCHECK(allow_macro_instructions_);
fmsub(fd, fn, fm, fa);
}
void MacroAssembler::Fmul(const VRegister& fd, const VRegister& fn,
                          const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fmul(fd, fn, fm);
}
void MacroAssembler::Fnmadd(const VRegister& fd, const VRegister& fn,
                            const VRegister& fm, const VRegister& fa) {
DCHECK(allow_macro_instructions_);
fnmadd(fd, fn, fm, fa);
}
void MacroAssembler::Fnmsub(const VRegister& fd, const VRegister& fn,
                            const VRegister& fm, const VRegister& fa) {
DCHECK(allow_macro_instructions_);
fnmsub(fd, fn, fm, fa);
}
void MacroAssembler::Fsub(const VRegister& fd, const VRegister& fn,
                          const VRegister& fm) {
DCHECK(allow_macro_instructions_);
fsub(fd, fn, fm);
}
@ -887,7 +812,7 @@ void MacroAssembler::Ldr(const CPURegister& rt, const Immediate& imm) {
void MacroAssembler::Ldr(const CPURegister& rt, double imm) {
DCHECK(allow_macro_instructions_);
DCHECK(rt.Is64Bits());
  ldr(rt, Immediate(bit_cast<uint64_t>(imm)));
}
@ -1070,9 +995,7 @@ void MacroAssembler::Sbfx(const Register& rd,
sbfx(rd, rn, lsb, width);
}
void MacroAssembler::Scvtf(const VRegister& fd, const Register& rn,
unsigned fbits) {
DCHECK(allow_macro_instructions_);
scvtf(fd, rn, fbits);
@ -1174,9 +1097,7 @@ void MacroAssembler::Ubfx(const Register& rd,
ubfx(rd, rn, lsb, width);
}
void MacroAssembler::Ucvtf(const VRegister& fd, const Register& rn,
unsigned fbits) {
DCHECK(allow_macro_instructions_);
ucvtf(fd, rn, fbits);
@ -1318,9 +1239,7 @@ void MacroAssembler::SmiUntag(Register dst, Register src) {
void MacroAssembler::SmiUntag(Register smi) { SmiUntag(smi, smi); }
void MacroAssembler::SmiUntagToDouble(VRegister dst, Register src,
UntagMode mode) {
DCHECK(dst.Is64Bits() && src.Is64Bits());
if (FLAG_enable_slow_asserts && (mode == kNotSpeculativeUntag)) {
@ -1329,9 +1248,7 @@ void MacroAssembler::SmiUntagToDouble(FPRegister dst,
Scvtf(dst, src, kSmiShift);
}
void MacroAssembler::SmiUntagToFloat(VRegister dst, Register src,
UntagMode mode) {
DCHECK(dst.Is32Bits() && src.Is64Bits());
if (FLAG_enable_slow_asserts && (mode == kNotSpeculativeUntag)) {

View File

@ -295,6 +295,171 @@ void MacroAssembler::Mov(const Register& rd,
}
}
void MacroAssembler::Movi16bitHelper(const VRegister& vd, uint64_t imm) {
DCHECK(is_uint16(imm));
int byte1 = (imm & 0xff);
int byte2 = ((imm >> 8) & 0xff);
if (byte1 == byte2) {
movi(vd.Is64Bits() ? vd.V8B() : vd.V16B(), byte1);
} else if (byte1 == 0) {
movi(vd, byte2, LSL, 8);
} else if (byte2 == 0) {
movi(vd, byte1);
} else if (byte1 == 0xff) {
mvni(vd, ~byte2 & 0xff, LSL, 8);
} else if (byte2 == 0xff) {
mvni(vd, ~byte1 & 0xff);
} else {
UseScratchRegisterScope temps(this);
Register temp = temps.AcquireW();
movz(temp, imm);
dup(vd, temp);
}
}
void MacroAssembler::Movi32bitHelper(const VRegister& vd, uint64_t imm) {
DCHECK(is_uint32(imm));
uint8_t bytes[sizeof(imm)];
memcpy(bytes, &imm, sizeof(imm));
// All bytes are either 0x00 or 0xff.
{
bool all0orff = true;
for (int i = 0; i < 4; ++i) {
if ((bytes[i] != 0) && (bytes[i] != 0xff)) {
all0orff = false;
break;
}
}
if (all0orff == true) {
movi(vd.Is64Bits() ? vd.V1D() : vd.V2D(), ((imm << 32) | imm));
return;
}
}
// Of the 4 bytes, only one byte is non-zero.
for (int i = 0; i < 4; i++) {
if ((imm & (0xff << (i * 8))) == imm) {
movi(vd, bytes[i], LSL, i * 8);
return;
}
}
// Of the 4 bytes, only one byte is not 0xff.
for (int i = 0; i < 4; i++) {
uint32_t mask = ~(0xff << (i * 8));
if ((imm & mask) == mask) {
mvni(vd, ~bytes[i] & 0xff, LSL, i * 8);
return;
}
}
// Immediate is of the form 0x00MMFFFF.
if ((imm & 0xff00ffff) == 0x0000ffff) {
movi(vd, bytes[2], MSL, 16);
return;
}
// Immediate is of the form 0x0000MMFF.
if ((imm & 0xffff00ff) == 0x000000ff) {
movi(vd, bytes[1], MSL, 8);
return;
}
// Immediate is of the form 0xFFMM0000.
if ((imm & 0xff00ffff) == 0xff000000) {
mvni(vd, ~bytes[2] & 0xff, MSL, 16);
return;
}
// Immediate is of the form 0xFFFFMM00.
if ((imm & 0xffff00ff) == 0xffff0000) {
mvni(vd, ~bytes[1] & 0xff, MSL, 8);
return;
}
// Top and bottom 16-bits are equal.
if (((imm >> 16) & 0xffff) == (imm & 0xffff)) {
Movi16bitHelper(vd.Is64Bits() ? vd.V4H() : vd.V8H(), imm & 0xffff);
return;
}
// Default case.
{
UseScratchRegisterScope temps(this);
Register temp = temps.AcquireW();
Mov(temp, imm);
dup(vd, temp);
}
}
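
The helper above tries each single-instruction movi/mvni form, including the MSL ("shifted ones") variants, before falling back to mov + dup. A standalone, hedged restatement of that classification (function name and messages are illustrative, not V8 API):

#include <cstdint>
#include <cstdio>

// Returns which single-instruction NEON form a 32-bit lane immediate fits.
const char* ClassifyImm32(uint32_t imm) {
  for (int i = 0; i < 4; i++) {
    // Only byte i non-zero -> movi with LSL.
    if ((imm & ~(0xffu << (i * 8))) == 0) return "movi byte, LSL #(8*i)";
    // Only byte i not 0xff -> mvni with LSL.
    if ((imm | (0xffu << (i * 8))) == 0xffffffffu)
      return "mvni byte, LSL #(8*i)";
  }
  if ((imm & 0xff00ffffu) == 0x0000ffffu) return "movi byte, MSL #16";
  if ((imm & 0xffff00ffu) == 0x000000ffu) return "movi byte, MSL #8";
  if ((imm & 0xff00ffffu) == 0xff000000u) return "mvni byte, MSL #16";
  if ((imm & 0xffff00ffu) == 0xffff0000u) return "mvni byte, MSL #8";
  return "no single movi/mvni form; fall back to mov + dup";
}

int main() {
  std::printf("%s\n", ClassifyImm32(0x00ab0000));  // movi, LSL #16
  std::printf("%s\n", ClassifyImm32(0x0000abff));  // movi, MSL #8
  std::printf("%s\n", ClassifyImm32(0x12345678));  // fallback
  return 0;
}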
void MacroAssembler::Movi64bitHelper(const VRegister& vd, uint64_t imm) {
// All bytes are either 0x00 or 0xff.
{
bool all0orff = true;
for (int i = 0; i < 8; ++i) {
int byteval = (imm >> (i * 8)) & 0xff;
if (byteval != 0 && byteval != 0xff) {
all0orff = false;
break;
}
}
if (all0orff == true) {
movi(vd, imm);
return;
}
}
// Top and bottom 32-bits are equal.
if (((imm >> 32) & 0xffffffff) == (imm & 0xffffffff)) {
Movi32bitHelper(vd.Is64Bits() ? vd.V2S() : vd.V4S(), imm & 0xffffffff);
return;
}
// Default case.
{
UseScratchRegisterScope temps(this);
Register temp = temps.AcquireX();
Mov(temp, imm);
if (vd.Is1D()) {
mov(vd.D(), 0, temp);
} else {
dup(vd.V2D(), temp);
}
}
}
void MacroAssembler::Movi(const VRegister& vd, uint64_t imm, Shift shift,
int shift_amount) {
DCHECK(allow_macro_instructions_);
if (shift_amount != 0 || shift != LSL) {
movi(vd, imm, shift, shift_amount);
} else if (vd.Is8B() || vd.Is16B()) {
// 8-bit immediate.
DCHECK(is_uint8(imm));
movi(vd, imm);
} else if (vd.Is4H() || vd.Is8H()) {
// 16-bit immediate.
Movi16bitHelper(vd, imm);
} else if (vd.Is2S() || vd.Is4S()) {
// 32-bit immediate.
Movi32bitHelper(vd, imm);
} else {
// 64-bit immediate.
Movi64bitHelper(vd, imm);
}
}
void MacroAssembler::Movi(const VRegister& vd, uint64_t hi, uint64_t lo) {
// TODO(all): Move 128-bit values in a more efficient way.
DCHECK(vd.Is128Bits());
UseScratchRegisterScope temps(this);
Movi(vd.V2D(), lo);
Register temp = temps.AcquireX();
Mov(temp, hi);
Ins(vd.V2D(), 1, temp);
}
void MacroAssembler::Mvn(const Register& rd, const Operand& operand) {
DCHECK(allow_macro_instructions_);
@ -566,7 +731,7 @@ void MacroAssembler::LoadStoreMacro(const CPURegister& rt,
const MemOperand& addr,
LoadStoreOp op) {
int64_t offset = addr.offset();
  unsigned size = CalcLSDataSize(op);
// Check if an immediate offset fits in the immediate field of the
// appropriate instruction. If not, emit two instructions to perform
@ -601,7 +766,7 @@ void MacroAssembler::LoadStorePairMacro(const CPURegister& rt,
DCHECK(!addr.IsRegisterOffset());
int64_t offset = addr.offset();
  unsigned size = CalcLSPairDataSize(op);
// Check if the offset fits in the immediate field of the appropriate
// instruction. If not, emit two instructions to perform the operation.
@ -929,8 +1094,7 @@ void MacroAssembler::Pop(const CPURegister& dst0, const CPURegister& dst1,
PopPostamble(count, size);
}
void MacroAssembler::Push(const Register& src0, const VRegister& src1) {
int size = src0.SizeInBytes() + src1.SizeInBytes();
PushPreamble(size);
@ -1397,9 +1561,8 @@ void MacroAssembler::AssertFPCRState(Register fpcr) {
}
}
void MacroAssembler::CanonicalizeNaN(const VRegister& dst,
                                     const VRegister& src) {
AssertFPCRState();
// Subtracting 0.0 preserves all inputs except for signalling NaNs, which
@ -2076,10 +2239,8 @@ void MacroAssembler::JumpIfNotHeapNumber(Register object,
JumpIfNotRoot(temp, Heap::kHeapNumberMapRootIndex, on_not_heap_number);
}
void MacroAssembler::TryRepresentDoubleAsInt(Register as_int, VRegister value,
                                             VRegister scratch_d,
Label* on_successful_conversion,
Label* on_failed_conversion) {
// Convert to an int and back again, then compare with the original value.
@ -2693,14 +2854,14 @@ void MacroAssembler::LeaveFrame(StackFrame::Type type) {
void MacroAssembler::ExitFramePreserveFPRegs() {
  PushCPURegList(kCallerSavedV);
}
void MacroAssembler::ExitFrameRestoreFPRegs() {
// Read the registers from the stack without popping them. The stack pointer
// will be reset as part of the unwinding process.
  CPURegList saved_fp_regs = kCallerSavedV;
DCHECK(saved_fp_regs.Count() % 2 == 0);
int offset = ExitFrameConstants::kLastExitFrameField;
@ -3179,7 +3340,7 @@ void MacroAssembler::AllocateHeapNumber(Register result,
if (!heap_number_map.IsValid()) {
// If we have a valid value register, use the same type of register to store
// the map so we can use STP to store both in one instruction.
    if (value.IsValid() && value.IsVRegister()) {
heap_number_map = temps.AcquireD();
} else {
heap_number_map = scratch1;
@ -3188,7 +3349,7 @@ void MacroAssembler::AllocateHeapNumber(Register result,
}
if (emit_debug_code()) {
Register map;
    if (heap_number_map.IsVRegister()) {
map = scratch1;
Fmov(map, DoubleRegister(heap_number_map));
} else {
@ -3650,14 +3811,14 @@ void MacroAssembler::PushSafepointRegisters() {
void MacroAssembler::PushSafepointRegistersAndDoubles() {
PushSafepointRegisters();
PushCPURegList(CPURegList(
      CPURegister::kVRegister, kDRegSizeInBits,
RegisterConfiguration::Crankshaft()->allocatable_double_codes_mask()));
}
void MacroAssembler::PopSafepointRegistersAndDoubles() {
PopCPURegList(CPURegList(
      CPURegister::kVRegister, kDRegSizeInBits,
RegisterConfiguration::Crankshaft()->allocatable_double_codes_mask()));
PopSafepointRegisters();
}
@ -4208,7 +4369,7 @@ void MacroAssembler::PrintfNoPreserve(const char * format,
static const CPURegList kPCSVarargs =
CPURegList(CPURegister::kRegister, kXRegSizeInBits, 1, arg_count);
static const CPURegList kPCSVarargsFP =
      CPURegList(CPURegister::kVRegister, kDRegSizeInBits, 0, arg_count - 1);
// We can use caller-saved registers as scratch values, except for the
// arguments and the PCS registers where they might need to go.
@ -4217,7 +4378,7 @@ void MacroAssembler::PrintfNoPreserve(const char * format,
tmp_list.Remove(kPCSVarargs);
tmp_list.Remove(arg0, arg1, arg2, arg3);
  CPURegList fp_tmp_list = kCallerSavedV;
fp_tmp_list.Remove(kPCSVarargsFP);
fp_tmp_list.Remove(arg0, arg1, arg2, arg3);
@ -4242,7 +4403,7 @@ void MacroAssembler::PrintfNoPreserve(const char * format,
// We might only need a W register here. We need to know the size of the
// argument so we can properly encode it for the simulator call.
if (args[i].Is32Bits()) pcs[i] = pcs[i].W();
    } else if (args[i].IsVRegister()) {
// In C, floats are always cast to doubles for varargs calls.
pcs[i] = pcs_varargs_fp.PopLowestIndex().D();
} else {
@ -4264,8 +4425,8 @@ void MacroAssembler::PrintfNoPreserve(const char * format,
Mov(new_arg, old_arg);
args[i] = new_arg;
} else {
      VRegister old_arg = VRegister(args[i]);
      VRegister new_arg = temps.AcquireSameSizeAs(old_arg);
Fmov(new_arg, old_arg);
args[i] = new_arg;
}
@ -4279,11 +4440,11 @@ void MacroAssembler::PrintfNoPreserve(const char * format,
if (pcs[i].IsRegister()) {
Mov(Register(pcs[i]), Register(args[i]), kDiscardForSameWReg);
} else {
      DCHECK(pcs[i].IsVRegister());
if (pcs[i].SizeInBytes() == args[i].SizeInBytes()) {
        Fmov(VRegister(pcs[i]), VRegister(args[i]));
} else {
        Fcvt(VRegister(pcs[i]), VRegister(args[i]));
}
}
}
@ -4371,11 +4532,11 @@ void MacroAssembler::Printf(const char * format,
// If csp is the stack pointer, PushCPURegList asserts that the size of each
// list is a multiple of 16 bytes.
PushCPURegList(kCallerSaved);
  PushCPURegList(kCallerSavedV);
// We can use caller-saved registers as scratch values (except for argN).
CPURegList tmp_list = kCallerSaved;
  CPURegList fp_tmp_list = kCallerSavedV;
tmp_list.Remove(arg0, arg1, arg2, arg3);
fp_tmp_list.Remove(arg0, arg1, arg2, arg3);
TmpList()->set_list(tmp_list.list());
@ -4394,7 +4555,7 @@ void MacroAssembler::Printf(const char * format,
// to PrintfNoPreserve as an argument.
Register arg_sp = temps.AcquireX();
Add(arg_sp, StackPointer(),
      kCallerSaved.TotalSizeInBytes() + kCallerSavedV.TotalSizeInBytes());
if (arg0_sp) arg0 = Register::Create(arg_sp.code(), arg0.SizeInBits());
if (arg1_sp) arg1 = Register::Create(arg_sp.code(), arg1.SizeInBits());
if (arg2_sp) arg2 = Register::Create(arg_sp.code(), arg2.SizeInBits());
@ -4418,7 +4579,7 @@ void MacroAssembler::Printf(const char * format,
}
}
  PopCPURegList(kCallerSavedV);
PopCPURegList(kCallerSaved);
TmpList()->set_list(old_tmp_list);
@ -4532,10 +4693,9 @@ Register UseScratchRegisterScope::AcquireSameSizeAs(const Register& reg) {
return Register::Create(code, reg.SizeInBits());
}
VRegister UseScratchRegisterScope::AcquireSameSizeAs(const VRegister& reg) {
  int code = AcquireNextAvailable(availablefp_).code();
  return VRegister::Create(code, reg.SizeInBits());
}

View File

@ -396,88 +396,85 @@ class MacroAssembler : public Assembler {
const Register& rn,
const Register& rm,
unsigned lsb);
inline void Fabs(const VRegister& fd, const VRegister& fn);
inline void Fadd(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fccmp(const VRegister& fn, const VRegister& fm, StatusFlags nzcv,
Condition cond);
inline void Fcmp(const VRegister& fn, const VRegister& fm);
inline void Fcmp(const VRegister& fn, double value);
inline void Fcsel(const VRegister& fd, const VRegister& fn,
const VRegister& fm, Condition cond);
inline void Fcvt(const VRegister& fd, const VRegister& fn);
void Fcvtl(const VRegister& vd, const VRegister& vn) {
DCHECK(allow_macro_instructions_);
fcvtl(vd, vn);
}
void Fcvtl2(const VRegister& vd, const VRegister& vn) {
DCHECK(allow_macro_instructions_);
fcvtl2(vd, vn);
}
void Fcvtn(const VRegister& vd, const VRegister& vn) {
DCHECK(allow_macro_instructions_);
fcvtn(vd, vn);
}
void Fcvtn2(const VRegister& vd, const VRegister& vn) {
DCHECK(allow_macro_instructions_);
fcvtn2(vd, vn);
}
void Fcvtxn(const VRegister& vd, const VRegister& vn) {
DCHECK(allow_macro_instructions_);
fcvtxn(vd, vn);
}
void Fcvtxn2(const VRegister& vd, const VRegister& vn) {
DCHECK(allow_macro_instructions_);
fcvtxn2(vd, vn);
}
inline void Fcvtas(const Register& rd, const VRegister& fn);
inline void Fcvtau(const Register& rd, const VRegister& fn);
inline void Fcvtms(const Register& rd, const VRegister& fn);
inline void Fcvtmu(const Register& rd, const VRegister& fn);
inline void Fcvtns(const Register& rd, const VRegister& fn);
inline void Fcvtnu(const Register& rd, const VRegister& fn);
inline void Fcvtzs(const Register& rd, const VRegister& fn);
inline void Fcvtzu(const Register& rd, const VRegister& fn);
inline void Fdiv(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fmadd(const VRegister& fd, const VRegister& fn,
const VRegister& fm, const VRegister& fa);
inline void Fmax(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fmaxnm(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fmin(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fminnm(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fmov(VRegister fd, VRegister fn);
inline void Fmov(VRegister fd, Register rn);
// Provide explicit double and float interfaces for FP immediate moves, rather
// than relying on implicit C++ casts. This allows signalling NaNs to be
// preserved when the immediate matches the format of fd. Most systems convert
// signalling NaNs to quiet NaNs when converting between float and double.
inline void Fmov(VRegister fd, double imm);
inline void Fmov(VRegister fd, float imm);
// Provide a template to allow other types to be converted automatically.
template <typename T>
void Fmov(VRegister fd, T imm) {
DCHECK(allow_macro_instructions_);
Fmov(fd, static_cast<double>(imm));
}
inline void Fmov(Register rd, VRegister fn);
inline void Fmsub(const VRegister& fd, const VRegister& fn,
const VRegister& fm, const VRegister& fa);
inline void Fmul(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Fnmadd(const VRegister& fd, const VRegister& fn,
const VRegister& fm, const VRegister& fa);
inline void Fnmsub(const VRegister& fd, const VRegister& fn,
const VRegister& fm, const VRegister& fa);
inline void Fsub(const VRegister& fd, const VRegister& fn,
const VRegister& fm);
inline void Hint(SystemHint code);
inline void Hlt(int code);
inline void Isb();
@ -507,6 +504,76 @@ class MacroAssembler : public Assembler {
const Register& ra);
inline void Mul(const Register& rd, const Register& rn, const Register& rm);
inline void Nop() { nop(); }
void Dup(const VRegister& vd, const VRegister& vn, int index) {
DCHECK(allow_macro_instructions_);
dup(vd, vn, index);
}
void Dup(const VRegister& vd, const Register& rn) {
DCHECK(allow_macro_instructions_);
dup(vd, rn);
}
void Ins(const VRegister& vd, int vd_index, const VRegister& vn,
int vn_index) {
DCHECK(allow_macro_instructions_);
ins(vd, vd_index, vn, vn_index);
}
void Ins(const VRegister& vd, int vd_index, const Register& rn) {
DCHECK(allow_macro_instructions_);
ins(vd, vd_index, rn);
}
void Mov(const VRegister& vd, int vd_index, const VRegister& vn,
int vn_index) {
DCHECK(allow_macro_instructions_);
mov(vd, vd_index, vn, vn_index);
}
void Mov(const VRegister& vd, const VRegister& vn, int index) {
DCHECK(allow_macro_instructions_);
mov(vd, vn, index);
}
void Mov(const VRegister& vd, int vd_index, const Register& rn) {
DCHECK(allow_macro_instructions_);
mov(vd, vd_index, rn);
}
void Mov(const Register& rd, const VRegister& vn, int vn_index) {
DCHECK(allow_macro_instructions_);
mov(rd, vn, vn_index);
}
void Movi(const VRegister& vd, uint64_t imm, Shift shift = LSL,
int shift_amount = 0);
void Movi(const VRegister& vd, uint64_t hi, uint64_t lo);
void Mvni(const VRegister& vd, const int imm8, Shift shift = LSL,
const int shift_amount = 0) {
DCHECK(allow_macro_instructions_);
mvni(vd, imm8, shift, shift_amount);
}
void Orr(const VRegister& vd, const int imm8, const int left_shift = 0) {
DCHECK(allow_macro_instructions_);
orr(vd, imm8, left_shift);
}
void Scvtf(const VRegister& vd, const VRegister& vn, int fbits = 0) {
DCHECK(allow_macro_instructions_);
scvtf(vd, vn, fbits);
}
void Ucvtf(const VRegister& vd, const VRegister& vn, int fbits = 0) {
DCHECK(allow_macro_instructions_);
ucvtf(vd, vn, fbits);
}
void Fcvtzs(const VRegister& vd, const VRegister& vn, int fbits = 0) {
DCHECK(allow_macro_instructions_);
fcvtzs(vd, vn, fbits);
}
void Fcvtzu(const VRegister& vd, const VRegister& vn, int fbits = 0) {
DCHECK(allow_macro_instructions_);
fcvtzu(vd, vn, fbits);
}
void Smov(const Register& rd, const VRegister& vn, int vn_index) {
DCHECK(allow_macro_instructions_);
smov(rd, vn, vn_index);
}
void Umov(const Register& rd, const VRegister& vn, int vn_index) {
DCHECK(allow_macro_instructions_);
umov(rd, vn, vn_index);
}
inline void Rbit(const Register& rd, const Register& rn);
inline void Ret(const Register& xn = lr);
inline void Rev(const Register& rd, const Register& rn);
@ -522,8 +589,7 @@ class MacroAssembler : public Assembler {
const Register& rn,
unsigned lsb,
unsigned width);
  inline void Scvtf(const VRegister& fd, const Register& rn,
unsigned fbits = 0);
inline void Sdiv(const Register& rd, const Register& rn, const Register& rm);
inline void Smaddl(const Register& rd,
@ -557,8 +623,7 @@ class MacroAssembler : public Assembler {
const Register& rn,
unsigned lsb,
unsigned width);
  inline void Ucvtf(const VRegister& fd, const Register& rn,
unsigned fbits = 0);
inline void Udiv(const Register& rd, const Register& rn, const Register& rm);
inline void Umaddl(const Register& rd,
@ -573,6 +638,516 @@ class MacroAssembler : public Assembler {
inline void Uxth(const Register& rd, const Register& rn);
inline void Uxtw(const Register& rd, const Register& rn);
// NEON 3 vector register instructions.
#define NEON_3VREG_MACRO_LIST(V) \
V(add, Add) \
V(addhn, Addhn) \
V(addhn2, Addhn2) \
V(addp, Addp) \
V(and_, And) \
V(bic, Bic) \
V(bif, Bif) \
V(bit, Bit) \
V(bsl, Bsl) \
V(cmeq, Cmeq) \
V(cmge, Cmge) \
V(cmgt, Cmgt) \
V(cmhi, Cmhi) \
V(cmhs, Cmhs) \
V(cmtst, Cmtst) \
V(eor, Eor) \
V(fabd, Fabd) \
V(facge, Facge) \
V(facgt, Facgt) \
V(faddp, Faddp) \
V(fcmeq, Fcmeq) \
V(fcmge, Fcmge) \
V(fcmgt, Fcmgt) \
V(fmaxnmp, Fmaxnmp) \
V(fmaxp, Fmaxp) \
V(fminnmp, Fminnmp) \
V(fminp, Fminp) \
V(fmla, Fmla) \
V(fmls, Fmls) \
V(fmulx, Fmulx) \
V(frecps, Frecps) \
V(frsqrts, Frsqrts) \
V(mla, Mla) \
V(mls, Mls) \
V(mul, Mul) \
V(orn, Orn) \
V(pmul, Pmul) \
V(pmull, Pmull) \
V(pmull2, Pmull2) \
V(raddhn, Raddhn) \
V(raddhn2, Raddhn2) \
V(rsubhn, Rsubhn) \
V(rsubhn2, Rsubhn2) \
V(sqadd, Sqadd) \
V(sqdmlal, Sqdmlal) \
V(sqdmlal2, Sqdmlal2) \
V(sqdmulh, Sqdmulh) \
V(sqdmull, Sqdmull) \
V(sqdmull2, Sqdmull2) \
V(sqrdmulh, Sqrdmulh) \
V(sqrshl, Sqrshl) \
V(sqshl, Sqshl) \
V(sqsub, Sqsub) \
V(srhadd, Srhadd) \
V(srshl, Srshl) \
V(sshl, Sshl) \
V(ssubl, Ssubl) \
V(ssubl2, Ssubl2) \
V(ssubw, Ssubw) \
V(ssubw2, Ssubw2) \
V(sub, Sub) \
V(subhn, Subhn) \
V(subhn2, Subhn2) \
V(trn1, Trn1) \
V(trn2, Trn2) \
V(orr, Orr) \
V(saba, Saba) \
V(sabal, Sabal) \
V(sabal2, Sabal2) \
V(sabd, Sabd) \
V(sabdl, Sabdl) \
V(sabdl2, Sabdl2) \
V(saddl, Saddl) \
V(saddl2, Saddl2) \
V(saddw, Saddw) \
V(saddw2, Saddw2) \
V(shadd, Shadd) \
V(shsub, Shsub) \
V(smax, Smax) \
V(smaxp, Smaxp) \
V(smin, Smin) \
V(sminp, Sminp) \
V(smlal, Smlal) \
V(smlal2, Smlal2) \
V(smlsl, Smlsl) \
V(smlsl2, Smlsl2) \
V(smull, Smull) \
V(smull2, Smull2) \
V(sqdmlsl, Sqdmlsl) \
V(sqdmlsl2, Sqdmlsl2) \
V(uaba, Uaba) \
V(uabal, Uabal) \
V(uabal2, Uabal2) \
V(uabd, Uabd) \
V(uabdl, Uabdl) \
V(uabdl2, Uabdl2) \
V(uaddl, Uaddl) \
V(uaddl2, Uaddl2) \
V(uaddw, Uaddw) \
V(uaddw2, Uaddw2) \
V(uhadd, Uhadd) \
V(uhsub, Uhsub) \
V(umax, Umax) \
V(umin, Umin) \
V(umlsl, Umlsl) \
V(umlsl2, Umlsl2) \
V(umull, Umull) \
V(umull2, Umull2) \
V(umaxp, Umaxp) \
V(uminp, Uminp) \
V(umlal, Umlal) \
V(umlal2, Umlal2) \
V(uqadd, Uqadd) \
V(uqrshl, Uqrshl) \
V(uqshl, Uqshl) \
V(uqsub, Uqsub) \
V(urhadd, Urhadd) \
V(urshl, Urshl) \
V(ushl, Ushl) \
V(usubl, Usubl) \
V(usubl2, Usubl2) \
V(usubw, Usubw) \
V(usubw2, Usubw2) \
V(uzp1, Uzp1) \
V(uzp2, Uzp2) \
V(zip1, Zip1) \
V(zip2, Zip2)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
void MASM(const VRegister& vd, const VRegister& vn, const VRegister& vm) { \
DCHECK(allow_macro_instructions_); \
ASM(vd, vn, vm); \
}
NEON_3VREG_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
#undef DEFINE_MACRO_ASM_FUNC
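
The macro lists above use the classic X-macro pattern: each V(lower, Upper) entry stamps out one wrapper that asserts and forwards to the raw assembler mnemonic. A minimal standalone illustration of the same pattern (demo names only, not V8 code):

#include <cstdio>

#define DEMO_LIST(V) \
  V(add, Add)        \
  V(sub, Sub)

void add(int a, int b) { std::printf("add %d %d\n", a, b); }
void sub(int a, int b) { std::printf("sub %d %d\n", a, b); }

// Each list entry expands to one forwarding wrapper.
#define DEFINE_WRAPPER(lower, Upper) \
  void Upper(int a, int b) { lower(a, b); }
DEMO_LIST(DEFINE_WRAPPER)
#undef DEFINE_WRAPPER

int main() {
  Add(1, 2);  // forwards to add()
  Sub(3, 4);  // forwards to sub()
  return 0;
}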
void Ext(const VRegister& vd, const VRegister& vn, const VRegister& vm,
int index) {
DCHECK(allow_macro_instructions_);
ext(vd, vn, vm, index);
}
// NEON 2 vector register instructions.
#define NEON_2VREG_MACRO_LIST(V) \
V(abs, Abs) \
V(addp, Addp) \
V(addv, Addv) \
V(cls, Cls) \
V(clz, Clz) \
V(cnt, Cnt) \
V(faddp, Faddp) \
V(fcvtas, Fcvtas) \
V(fcvtau, Fcvtau) \
V(fcvtms, Fcvtms) \
V(fcvtmu, Fcvtmu) \
V(fcvtns, Fcvtns) \
V(fcvtnu, Fcvtnu) \
V(fcvtps, Fcvtps) \
V(fcvtpu, Fcvtpu) \
V(fmaxnmp, Fmaxnmp) \
V(fmaxnmv, Fmaxnmv) \
V(fmaxv, Fmaxv) \
V(fminnmp, Fminnmp) \
V(fminnmv, Fminnmv) \
V(fminp, Fminp) \
V(fmaxp, Fmaxp) \
V(fminv, Fminv) \
V(fneg, Fneg) \
V(frecpe, Frecpe) \
V(frecpx, Frecpx) \
V(frinta, Frinta) \
V(frinti, Frinti) \
V(frintm, Frintm) \
V(frintn, Frintn) \
V(frintp, Frintp) \
V(frintx, Frintx) \
V(frintz, Frintz) \
V(frsqrte, Frsqrte) \
V(fsqrt, Fsqrt) \
V(mov, Mov) \
V(mvn, Mvn) \
V(neg, Neg) \
V(not_, Not) \
V(rbit, Rbit) \
V(rev16, Rev16) \
V(rev32, Rev32) \
V(rev64, Rev64) \
V(sadalp, Sadalp) \
V(saddlv, Saddlv) \
V(smaxv, Smaxv) \
V(sminv, Sminv) \
V(saddlp, Saddlp) \
V(sqabs, Sqabs) \
V(sqneg, Sqneg) \
V(sqxtn, Sqxtn) \
V(sqxtn2, Sqxtn2) \
V(sqxtun, Sqxtun) \
V(sqxtun2, Sqxtun2) \
V(suqadd, Suqadd) \
V(sxtl, Sxtl) \
V(sxtl2, Sxtl2) \
V(uadalp, Uadalp) \
V(uaddlp, Uaddlp) \
V(uaddlv, Uaddlv) \
V(umaxv, Umaxv) \
V(uminv, Uminv) \
V(uqxtn, Uqxtn) \
V(uqxtn2, Uqxtn2) \
V(urecpe, Urecpe) \
V(ursqrte, Ursqrte) \
V(usqadd, Usqadd) \
V(uxtl, Uxtl) \
V(uxtl2, Uxtl2) \
V(xtn, Xtn) \
V(xtn2, Xtn2)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
void MASM(const VRegister& vd, const VRegister& vn) { \
DCHECK(allow_macro_instructions_); \
ASM(vd, vn); \
}
NEON_2VREG_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
#undef DEFINE_MACRO_ASM_FUNC
// NEON 2 vector register with immediate instructions.
#define NEON_2VREG_FPIMM_MACRO_LIST(V) \
V(fcmeq, Fcmeq) \
V(fcmge, Fcmge) \
V(fcmgt, Fcmgt) \
V(fcmle, Fcmle) \
V(fcmlt, Fcmlt)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
void MASM(const VRegister& vd, const VRegister& vn, double imm) { \
DCHECK(allow_macro_instructions_); \
ASM(vd, vn, imm); \
}
NEON_2VREG_FPIMM_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
#undef DEFINE_MACRO_ASM_FUNC
void Bic(const VRegister& vd, const int imm8, const int left_shift = 0) {
DCHECK(allow_macro_instructions_);
bic(vd, imm8, left_shift);
}
void Cmeq(const VRegister& vd, const VRegister& vn, int imm) {
DCHECK(allow_macro_instructions_);
cmeq(vd, vn, imm);
}
void Cmge(const VRegister& vd, const VRegister& vn, int imm) {
DCHECK(allow_macro_instructions_);
cmge(vd, vn, imm);
}
void Cmgt(const VRegister& vd, const VRegister& vn, int imm) {
DCHECK(allow_macro_instructions_);
cmgt(vd, vn, imm);
}
void Cmle(const VRegister& vd, const VRegister& vn, int imm) {
DCHECK(allow_macro_instructions_);
cmle(vd, vn, imm);
}
void Cmlt(const VRegister& vd, const VRegister& vn, int imm) {
DCHECK(allow_macro_instructions_);
cmlt(vd, vn, imm);
}
// NEON by element instructions.
#define NEON_BYELEMENT_MACRO_LIST(V) \
V(fmul, Fmul) \
V(fmla, Fmla) \
V(fmls, Fmls) \
V(fmulx, Fmulx) \
V(mul, Mul) \
V(mla, Mla) \
V(mls, Mls) \
V(sqdmulh, Sqdmulh) \
V(sqrdmulh, Sqrdmulh) \
V(sqdmull, Sqdmull) \
V(sqdmull2, Sqdmull2) \
V(sqdmlal, Sqdmlal) \
V(sqdmlal2, Sqdmlal2) \
V(sqdmlsl, Sqdmlsl) \
V(sqdmlsl2, Sqdmlsl2) \
V(smull, Smull) \
V(smull2, Smull2) \
V(smlal, Smlal) \
V(smlal2, Smlal2) \
V(smlsl, Smlsl) \
V(smlsl2, Smlsl2) \
V(umull, Umull) \
V(umull2, Umull2) \
V(umlal, Umlal) \
V(umlal2, Umlal2) \
V(umlsl, Umlsl) \
V(umlsl2, Umlsl2)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
void MASM(const VRegister& vd, const VRegister& vn, const VRegister& vm, \
int vm_index) { \
DCHECK(allow_macro_instructions_); \
ASM(vd, vn, vm, vm_index); \
}
NEON_BYELEMENT_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
#undef DEFINE_MACRO_ASM_FUNC
#define NEON_2VREG_SHIFT_MACRO_LIST(V) \
V(rshrn, Rshrn) \
V(rshrn2, Rshrn2) \
V(shl, Shl) \
V(shll, Shll) \
V(shll2, Shll2) \
V(shrn, Shrn) \
V(shrn2, Shrn2) \
V(sli, Sli) \
V(sqrshrn, Sqrshrn) \
V(sqrshrn2, Sqrshrn2) \
V(sqrshrun, Sqrshrun) \
V(sqrshrun2, Sqrshrun2) \
V(sqshl, Sqshl) \
V(sqshlu, Sqshlu) \
V(sqshrn, Sqshrn) \
V(sqshrn2, Sqshrn2) \
V(sqshrun, Sqshrun) \
V(sqshrun2, Sqshrun2) \
V(sri, Sri) \
V(srshr, Srshr) \
V(srsra, Srsra) \
V(sshll, Sshll) \
V(sshll2, Sshll2) \
V(sshr, Sshr) \
V(ssra, Ssra) \
V(uqrshrn, Uqrshrn) \
V(uqrshrn2, Uqrshrn2) \
V(uqshl, Uqshl) \
V(uqshrn, Uqshrn) \
V(uqshrn2, Uqshrn2) \
V(urshr, Urshr) \
V(ursra, Ursra) \
V(ushll, Ushll) \
V(ushll2, Ushll2) \
V(ushr, Ushr) \
V(usra, Usra)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
void MASM(const VRegister& vd, const VRegister& vn, int shift) { \
DCHECK(allow_macro_instructions_); \
ASM(vd, vn, shift); \
}
NEON_2VREG_SHIFT_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
#undef DEFINE_MACRO_ASM_FUNC
void Ld1(const VRegister& vt, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld1(vt, src);
}
void Ld1(const VRegister& vt, const VRegister& vt2, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld1(vt, vt2, src);
}
void Ld1(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld1(vt, vt2, vt3, src);
}
void Ld1(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld1(vt, vt2, vt3, vt4, src);
}
void Ld1(const VRegister& vt, int lane, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld1(vt, lane, src);
}
void Ld1r(const VRegister& vt, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld1r(vt, src);
}
void Ld2(const VRegister& vt, const VRegister& vt2, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld2(vt, vt2, src);
}
void Ld2(const VRegister& vt, const VRegister& vt2, int lane,
const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld2(vt, vt2, lane, src);
}
void Ld2r(const VRegister& vt, const VRegister& vt2, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld2r(vt, vt2, src);
}
void Ld3(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld3(vt, vt2, vt3, src);
}
void Ld3(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
int lane, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld3(vt, vt2, vt3, lane, src);
}
void Ld3r(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld3r(vt, vt2, vt3, src);
}
void Ld4(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld4(vt, vt2, vt3, vt4, src);
}
void Ld4(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, int lane, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld4(vt, vt2, vt3, vt4, lane, src);
}
void Ld4r(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, const MemOperand& src) {
DCHECK(allow_macro_instructions_);
ld4r(vt, vt2, vt3, vt4, src);
}
void St1(const VRegister& vt, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st1(vt, dst);
}
void St1(const VRegister& vt, const VRegister& vt2, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st1(vt, vt2, dst);
}
void St1(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st1(vt, vt2, vt3, dst);
}
void St1(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st1(vt, vt2, vt3, vt4, dst);
}
void St1(const VRegister& vt, int lane, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st1(vt, lane, dst);
}
void St2(const VRegister& vt, const VRegister& vt2, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st2(vt, vt2, dst);
}
void St3(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st3(vt, vt2, vt3, dst);
}
void St4(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st4(vt, vt2, vt3, vt4, dst);
}
void St2(const VRegister& vt, const VRegister& vt2, int lane,
const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st2(vt, vt2, lane, dst);
}
void St3(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
int lane, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st3(vt, vt2, vt3, lane, dst);
}
void St4(const VRegister& vt, const VRegister& vt2, const VRegister& vt3,
const VRegister& vt4, int lane, const MemOperand& dst) {
DCHECK(allow_macro_instructions_);
st4(vt, vt2, vt3, vt4, lane, dst);
}
void Tbl(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbl(vd, vn, vm);
}
void Tbl(const VRegister& vd, const VRegister& vn, const VRegister& vn2,
const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbl(vd, vn, vn2, vm);
}
void Tbl(const VRegister& vd, const VRegister& vn, const VRegister& vn2,
const VRegister& vn3, const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbl(vd, vn, vn2, vn3, vm);
}
void Tbl(const VRegister& vd, const VRegister& vn, const VRegister& vn2,
const VRegister& vn3, const VRegister& vn4, const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbl(vd, vn, vn2, vn3, vn4, vm);
}
void Tbx(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbx(vd, vn, vm);
}
void Tbx(const VRegister& vd, const VRegister& vn, const VRegister& vn2,
const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbx(vd, vn, vn2, vm);
}
void Tbx(const VRegister& vd, const VRegister& vn, const VRegister& vn2,
const VRegister& vn3, const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbx(vd, vn, vn2, vn3, vm);
}
void Tbx(const VRegister& vd, const VRegister& vn, const VRegister& vn2,
const VRegister& vn3, const VRegister& vn4, const VRegister& vm) {
DCHECK(allow_macro_instructions_);
tbx(vd, vn, vn2, vn3, vn4, vm);
}
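
// Tbl and Tbx wrap the same table-lookup instructions and differ only in
// out-of-range index handling; a hedged usage sketch (registers and values
// illustrative only):
//   With v0 holding a 16-byte table and v1 holding indices {0x00, 0x05, 0x40}:
//   Tbl(v2.V8B(), v0.V16B(), v1.V8B());  // out-of-range 0x40 lane -> 0x00
//   Tbx(v2.V8B(), v0.V16B(), v1.V8B());  // out-of-range lane keeps v2's byte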
// Pseudo-instructions ------------------------------------------------------
// Compute rd = abs(rm).
@ -623,7 +1198,7 @@ class MacroAssembler : public Assembler {
const CPURegister& dst2, const CPURegister& dst3,
const CPURegister& dst4, const CPURegister& dst5 = NoReg,
const CPURegister& dst6 = NoReg, const CPURegister& dst7 = NoReg);
  void Push(const Register& src0, const VRegister& src1);
// Alternative forms of Push and Pop, taking a RegList or CPURegList that
// specifies the registers that are to be pushed or popped. Higher-numbered
@ -659,16 +1234,16 @@ class MacroAssembler : public Assembler {
PopSizeRegList(regs, kWRegSizeInBits);
}
inline void PushDRegList(RegList regs) {
    PushSizeRegList(regs, kDRegSizeInBits, CPURegister::kVRegister);
}
inline void PopDRegList(RegList regs) {
    PopSizeRegList(regs, kDRegSizeInBits, CPURegister::kVRegister);
}
inline void PushSRegList(RegList regs) {
    PushSizeRegList(regs, kSRegSizeInBits, CPURegister::kVRegister);
}
inline void PopSRegList(RegList regs) {
    PopSizeRegList(regs, kSRegSizeInBits, CPURegister::kVRegister);
}
// Push the specified register 'count' times.
@ -904,10 +1479,8 @@ class MacroAssembler : public Assembler {
inline void InitializeRootRegister();
void AssertFPCRState(Register fpcr = NoReg);
  void CanonicalizeNaN(const VRegister& dst, const VRegister& src);
  void CanonicalizeNaN(const VRegister& reg) { CanonicalizeNaN(reg, reg); }
// Load an object from the root table.
void LoadRoot(CPURegister destination,
@ -957,11 +1530,9 @@ class MacroAssembler : public Assembler {
inline void SmiTag(Register smi);
inline void SmiUntag(Register dst, Register src);
inline void SmiUntag(Register smi);
  inline void SmiUntagToDouble(VRegister dst, Register src,
UntagMode mode = kNotSpeculativeUntag);
  inline void SmiUntagToFloat(VRegister dst, Register src,
UntagMode mode = kNotSpeculativeUntag);
// Tag and push in one step.
@ -1043,9 +1614,8 @@ class MacroAssembler : public Assembler {
// are represented as 0 and handled as a success.
//
// On output the Z flag is set if the operation was successful.
  void TryRepresentDoubleAsInt32(Register as_int, VRegister value,
                                 VRegister scratch_d,
Label* on_successful_conversion = NULL,
Label* on_failed_conversion = NULL) {
DCHECK(as_int.Is32Bits());
@ -1058,9 +1628,8 @@ class MacroAssembler : public Assembler {
// are represented as 0 and handled as a success.
//
// On output the Z flag is set if the operation was successful.
  void TryRepresentDoubleAsInt64(Register as_int, VRegister value,
                                 VRegister scratch_d,
Label* on_successful_conversion = NULL,
Label* on_failed_conversion = NULL) {
DCHECK(as_int.Is64Bits());
@ -1323,11 +1892,9 @@ class MacroAssembler : public Assembler {
// All registers are clobbered.
// If no heap_number_map register is provided, the function will take care of
// loading it.
  void AllocateHeapNumber(Register result, Label* gc_required,
                          Register scratch1, Register scratch2,
                          CPURegister value = NoVReg,
CPURegister heap_number_map = NoReg,
MutableMode mode = IMMUTABLE);
@ -1800,7 +2367,7 @@ class MacroAssembler : public Assembler {
// Like printf, but print at run-time from generated code.
//
// The caller must ensure that arguments for floating-point placeholders
  // (such as %e, %f or %g) are VRegisters, and that arguments for integer
// placeholders are Registers.
//
// At the moment it is only possible to print the value of csp if it is the
@ -1894,6 +2461,10 @@ class MacroAssembler : public Assembler {
const CPURegister& dst0, const CPURegister& dst1,
const CPURegister& dst2, const CPURegister& dst3);
void Movi16bitHelper(const VRegister& vd, uint64_t imm);
void Movi32bitHelper(const VRegister& vd, uint64_t imm);
void Movi64bitHelper(const VRegister& vd, uint64_t imm);
// Call Printf. On a native build, a simple call will be generated, but if the
// simulator is being used then a suitable pseudo-instruction is used. The
// arguments and stack (csp) must be prepared by the caller as for a normal
@ -1918,9 +2489,8 @@ class MacroAssembler : public Assembler {
// important it must be checked separately.
//
// On output the Z flag is set if the operation was successful.
  void TryRepresentDoubleAsInt(Register as_int, VRegister value,
                               VRegister scratch_d,
Label* on_successful_conversion = NULL,
Label* on_failed_conversion = NULL);
@ -2040,8 +2610,8 @@ class UseScratchRegisterScope {
availablefp_(masm->FPTmpList()),
old_available_(available_->list()),
old_availablefp_(availablefp_->list()) {
    DCHECK_EQ(available_->type(), CPURegister::kRegister);
    DCHECK_EQ(availablefp_->type(), CPURegister::kVRegister);
}
~UseScratchRegisterScope();
@ -2050,15 +2620,15 @@ class UseScratchRegisterScope {
// automatically when the scope ends.
Register AcquireW() { return AcquireNextAvailable(available_).W(); }
Register AcquireX() { return AcquireNextAvailable(available_).X(); }
  VRegister AcquireS() { return AcquireNextAvailable(availablefp_).S(); }
  VRegister AcquireD() { return AcquireNextAvailable(availablefp_).D(); }
Register UnsafeAcquire(const Register& reg) {
return Register(UnsafeAcquire(available_, reg));
}
Register AcquireSameSizeAs(const Register& reg);
  VRegister AcquireSameSizeAs(const VRegister& reg);
private:
static CPURegister AcquireNextAvailable(CPURegList* available);
@ -2067,11 +2637,11 @@ class UseScratchRegisterScope {
// Available scratch registers.
CPURegList* available_; // kRegister
  CPURegList* availablefp_;  // kVRegister
// The state of the available lists at the start of this scope.
RegList old_available_; // kRegister
  RegList old_availablefp_;  // kVRegister
};
MemOperand ContextMemOperand(Register context, int index = 0);

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@ -12,23 +12,78 @@ namespace internal {
#define __ assm->
uint32_t float_sign(float val) {
uint32_t bits = bit_cast<uint32_t>(val);
return unsigned_bitextract_32(31, 31, bits);
}
uint32_t float_exp(float val) {
uint32_t bits = bit_cast<uint32_t>(val);
return unsigned_bitextract_32(30, 23, bits);
}
uint32_t float_mantissa(float val) {
uint32_t bits = bit_cast<uint32_t>(val);
return unsigned_bitextract_32(22, 0, bits);
}
uint32_t double_sign(double val) {
uint64_t bits = bit_cast<uint64_t>(val);
return static_cast<uint32_t>(unsigned_bitextract_64(63, 63, bits));
}
uint32_t double_exp(double val) {
uint64_t bits = bit_cast<uint64_t>(val);
return static_cast<uint32_t>(unsigned_bitextract_64(62, 52, bits));
}
uint64_t double_mantissa(double val) {
uint64_t bits = bit_cast<uint64_t>(val);
return unsigned_bitextract_64(51, 0, bits);
}
float float_pack(uint32_t sign, uint32_t exp, uint32_t mantissa) {
uint32_t bits = sign << kFloatExponentBits | exp;
return bit_cast<float>((bits << kFloatMantissaBits) | mantissa);
}
double double_pack(uint64_t sign, uint64_t exp, uint64_t mantissa) {
uint64_t bits = sign << kDoubleExponentBits | exp;
return bit_cast<double>((bits << kDoubleMantissaBits) | mantissa);
}
int float16classify(float16 value) {
const uint16_t exponent_max = (1 << kFloat16ExponentBits) - 1;
const uint16_t exponent_mask = exponent_max << kFloat16MantissaBits;
const uint16_t mantissa_mask = (1 << kFloat16MantissaBits) - 1;
const uint16_t exponent = (value & exponent_mask) >> kFloat16MantissaBits;
const uint16_t mantissa = value & mantissa_mask;
if (exponent == 0) {
if (mantissa == 0) {
return FP_ZERO;
}
return FP_SUBNORMAL;
} else if (exponent == exponent_max) {
if (mantissa == 0) {
return FP_INFINITE;
}
return FP_NAN;
}
return FP_NORMAL;
}
int CountLeadingZeros(uint64_t value, int width) {
  DCHECK(base::bits::IsPowerOfTwo32(width) && (width <= 64));
  if (value == 0) {
    return width;
  }
  return base::bits::CountLeadingZeros64(value << (64 - width));
}
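
The shift trick above left-aligns a width-bit value so a single 64-bit CLZ gives the answer for any power-of-two width. A standalone sketch with a portable fallback (the real code calls base::bits::CountLeadingZeros64):

#include <cassert>
#include <cstdint>

int Clz64(uint64_t v) {
  int n = 0;
  for (uint64_t bit = uint64_t{1} << 63; bit && !(v & bit); bit >>= 1) n++;
  return n;
}

int CountLeadingZerosSketch(uint64_t value, int width) {
  if (value == 0) return width;
  // Left-align the width-bit value; the zeros shifted in don't get counted.
  return Clz64(value << (64 - width));
}

int main() {
  assert(CountLeadingZerosSketch(1, 32) == 31);
  assert(CountLeadingZerosSketch(0x80000000u, 32) == 0);
  assert(CountLeadingZerosSketch(0, 64) == 64);
  return 0;
}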
int CountLeadingSignBits(int64_t value, int width) {
  DCHECK(base::bits::IsPowerOfTwo32(width) && (width <= 64));
if (value >= 0) {
return CountLeadingZeros(value, width) - 1;
} else {
@ -38,43 +93,32 @@ int CountLeadingSignBits(int64_t value, int width) {
int CountTrailingZeros(uint64_t value, int width) {
  DCHECK((width == 32) || (width == 64));
  if (width == 64) {
    return static_cast<int>(base::bits::CountTrailingZeros64(value));
  }
  return static_cast<int>(base::bits::CountTrailingZeros32(
      static_cast<uint32_t>(value & 0xffffffff)));
}
int CountSetBits(uint64_t value, int width) {
  // TODO(jbramley): Would it be useful to allow other widths? The
  // implementation already supports them.
  DCHECK((width == 32) || (width == 64));
  if (width == 64) {
    return static_cast<int>(base::bits::CountPopulation64(value));
  }
  return static_cast<int>(base::bits::CountPopulation32(
      static_cast<uint32_t>(value & 0xffffffff)));
}

int LowestSetBitPosition(uint64_t value) {
  DCHECK_NE(value, 0U);
  return CountTrailingZeros(value, 64) + 1;
}

int HighestSetBitPosition(uint64_t value) {
  DCHECK_NE(value, 0U);
  return 63 - CountLeadingZeros(value, 64);
}
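
Note the differing conventions: LowestSetBitPosition is 1-based, while HighestSetBitPosition is a 0-based bit index. A worked check for value = 0b101000 (bits 3 and 5 set; standalone restatement, not the helpers above):

#include <cassert>

int main() {
  // CountTrailingZeros = 3, so LowestSetBitPosition  = 3 + 1  = 4 (1-based).
  // CountLeadingZeros  = 58, so HighestSetBitPosition = 63-58 = 5 (0-based).
  const unsigned value = 0b101000;
  assert((value & (1u << 3)) && !(value & ((1u << 3) - 1)));  // lowest: bit 3
  assert((value >> 5) == 1u);                                 // highest: bit 5
  return 0;
}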
@ -84,7 +128,7 @@ uint64_t LargestPowerOf2Divisor(uint64_t value) {
int MaskToBit(uint64_t mask) {
  DCHECK_EQ(CountSetBits(mask, 64), 1);
return CountTrailingZeros(mask, 64);
}

View File

@ -8,6 +8,7 @@
#include <cmath>
#include "src/arm64/constants-arm64.h"
#include "src/utils.h"
namespace v8 {
namespace internal {
@ -16,40 +17,26 @@ namespace internal {
STATIC_ASSERT((static_cast<int32_t>(-1) >> 1) == -1);
STATIC_ASSERT((static_cast<uint32_t>(-1) >> 1) == 0x7FFFFFFF);
// Floating point representation.
uint32_t float_sign(float val);
uint32_t float_exp(float val);
uint32_t float_mantissa(float val);
uint32_t double_sign(double val);
uint32_t double_exp(double val);
uint64_t double_mantissa(double val);

float float_pack(uint32_t sign, uint32_t exp, uint32_t mantissa);
double double_pack(uint64_t sign, uint64_t exp, uint64_t mantissa);

// An fpclassify() function for 16-bit half-precision floats.
int float16classify(float16 value);
// Bit counting.
int CountLeadingZeros(uint64_t value, int width);
int CountLeadingSignBits(int64_t value, int width);
int CountTrailingZeros(uint64_t value, int width);
int CountSetBits(uint64_t value, int width);
int LowestSetBitPosition(uint64_t value);
int HighestSetBitPosition(uint64_t value);
uint64_t LargestPowerOf2Divisor(uint64_t value);
int MaskToBit(uint64_t mask);
@ -86,7 +73,7 @@ T ReverseBytes(T value, int block_bytes_log2) {
// NaN tests.
inline bool IsSignallingNaN(double num) {
  uint64_t raw = bit_cast<uint64_t>(num);
if (std::isnan(num) && ((raw & kDQuietNanMask) == 0)) {
return true;
}
@ -95,13 +82,17 @@ inline bool IsSignallingNaN(double num) {
inline bool IsSignallingNaN(float num) {
  uint32_t raw = bit_cast<uint32_t>(num);
if (std::isnan(num) && ((raw & kSQuietNanMask) == 0)) {
return true;
}
return false;
}
inline bool IsSignallingNaN(float16 num) {
const uint16_t kFP16QuietNaNMask = 0x0200;
return (float16classify(num) == FP_NAN) && ((num & kFP16QuietNaNMask) == 0);
}
template <typename T>
inline bool IsQuietNaN(T num) {
@ -112,13 +103,14 @@ inline bool IsQuietNaN(T num) {
// Convert the NaN in 'num' to a quiet NaN.
inline double ToQuietNaN(double num) {
DCHECK(std::isnan(num));
  return bit_cast<double>(bit_cast<uint64_t>(num) | kDQuietNanMask);
}
inline float ToQuietNaN(float num) {
DCHECK(std::isnan(num));
return rawbits_to_float(float_to_rawbits(num) | kSQuietNanMask);
return bit_cast<float>(bit_cast<uint32_t>(num) |
static_cast<uint32_t>(kSQuietNanMask));
}
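
The patch swaps the local rawbits helpers for V8's bit_cast, which performs the same memcpy-based type pun. A self-contained equivalent of the new ToQuietNaN(double), using memcpy directly and assuming kDQuietNanMask is bit 51 (the top mantissa bit, which distinguishes quiet from signalling NaNs):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Assumed value of kDQuietNanMask: bit 51, the quiet bit of a double NaN.
const uint64_t kDQuietNanMaskSketch = 0x0008000000000000ULL;

double ToQuietNaNSketch(double num) {
  assert(std::isnan(num));
  uint64_t raw;
  memcpy(&raw, &num, sizeof(raw));  // bit_cast<uint64_t>(num)
  raw |= kDQuietNanMaskSketch;      // Set the quiet bit.
  double result;
  memcpy(&result, &raw, sizeof(result));  // bit_cast<double>(raw)
  return result;
}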


@ -253,7 +253,7 @@ class Arm64OperandConverter final : public InstructionOperandConverter {
int from_sp = offset.offset() + frame_access_state()->GetSPToFPOffset();
// Convert FP-offsets to SP-offsets if it results in better code.
if (Assembler::IsImmLSUnscaled(from_sp) ||
Assembler::IsImmLSScaled(from_sp, LSDoubleWord)) {
Assembler::IsImmLSScaled(from_sp, 3)) {
offset = FrameOffset::FromStackPointer(from_sp);
}
}
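
IsImmLSScaled now takes the access-size log2 directly (3 for 8-byte accesses) instead of the old LSDoubleWord enumerator. The rule it checks, per the A64 unsigned-offset addressing form, is sketched below (an illustration under that assumption, not the assembler's code):

#include <cstdint>

// A64 unsigned-offset loads/stores encode offset / access_size in a
// 12-bit immediate, so the byte offset must be a non-negative multiple
// of the access size and the scaled value must fit in 12 bits.
bool IsImmLSScaledSketch(int64_t offset, unsigned size_log2) {
  bool is_size_multiple = ((offset >> size_log2) << size_log2) == offset;
  return is_size_multiple && offset >= 0 && (offset >> size_log2) < 4096;
}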
@ -1945,11 +1945,11 @@ void CodeGenerator::FinishFrame(Frame* frame) {
}
// Save FP registers.
CPURegList saves_fp = CPURegList(CPURegister::kFPRegister, kDRegSizeInBits,
CPURegList saves_fp = CPURegList(CPURegister::kVRegister, kDRegSizeInBits,
descriptor->CalleeSavedFPRegisters());
int saved_count = saves_fp.Count();
if (saved_count != 0) {
DCHECK(saves_fp.list() == CPURegList::GetCalleeSavedFP().list());
DCHECK(saves_fp.list() == CPURegList::GetCalleeSavedV().list());
frame->AllocateSavedCalleeRegisterSlots(saved_count *
(kDoubleSize / kPointerSize));
}
@ -2068,11 +2068,11 @@ void CodeGenerator::AssembleConstructFrame() {
}
// Save FP registers.
CPURegList saves_fp = CPURegList(CPURegister::kFPRegister, kDRegSizeInBits,
CPURegList saves_fp = CPURegList(CPURegister::kVRegister, kDRegSizeInBits,
descriptor->CalleeSavedFPRegisters());
int saved_count = saves_fp.Count();
if (saved_count != 0) {
DCHECK(saves_fp.list() == CPURegList::GetCalleeSavedFP().list());
DCHECK(saves_fp.list() == CPURegList::GetCalleeSavedV().list());
__ PushCPURegList(saves_fp);
}
// Save registers.
@ -2098,7 +2098,7 @@ void CodeGenerator::AssembleReturn(InstructionOperand* pop) {
}
// Restore fp registers.
CPURegList saves_fp = CPURegList(CPURegister::kFPRegister, kDRegSizeInBits,
CPURegList saves_fp = CPURegList(CPURegister::kVRegister, kDRegSizeInBits,
descriptor->CalleeSavedFPRegisters());
if (saves_fp.Count() != 0) {
__ PopCPURegList(saves_fp);
@ -2198,7 +2198,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
}
} else if (src.type() == Constant::kFloat32) {
if (destination->IsFPRegister()) {
FPRegister dst = g.ToDoubleRegister(destination).S();
VRegister dst = g.ToDoubleRegister(destination).S();
__ Fmov(dst, src.ToFloat32());
} else {
DCHECK(destination->IsFPStackSlot());
@ -2206,7 +2206,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
__ Str(wzr, g.ToMemOperand(destination, masm()));
} else {
UseScratchRegisterScope scope(masm());
FPRegister temp = scope.AcquireS();
VRegister temp = scope.AcquireS();
__ Fmov(temp, src.ToFloat32());
__ Str(temp, g.ToMemOperand(destination, masm()));
}
@ -2214,7 +2214,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
} else {
DCHECK_EQ(Constant::kFloat64, src.type());
if (destination->IsFPRegister()) {
FPRegister dst = g.ToDoubleRegister(destination);
VRegister dst = g.ToDoubleRegister(destination);
__ Fmov(dst, src.ToFloat64());
} else {
DCHECK(destination->IsFPStackSlot());
@ -2222,16 +2222,16 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
__ Str(xzr, g.ToMemOperand(destination, masm()));
} else {
UseScratchRegisterScope scope(masm());
FPRegister temp = scope.AcquireD();
VRegister temp = scope.AcquireD();
__ Fmov(temp, src.ToFloat64());
__ Str(temp, g.ToMemOperand(destination, masm()));
}
}
}
} else if (source->IsFPRegister()) {
FPRegister src = g.ToDoubleRegister(source);
VRegister src = g.ToDoubleRegister(source);
if (destination->IsFPRegister()) {
FPRegister dst = g.ToDoubleRegister(destination);
VRegister dst = g.ToDoubleRegister(destination);
__ Fmov(dst, src);
} else {
DCHECK(destination->IsFPStackSlot());
@ -2244,7 +2244,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
__ Ldr(g.ToDoubleRegister(destination), src);
} else {
UseScratchRegisterScope scope(masm());
FPRegister temp = scope.AcquireD();
VRegister temp = scope.AcquireD();
__ Ldr(temp, src);
__ Str(temp, g.ToMemOperand(destination, masm()));
}
@ -2288,10 +2288,10 @@ void CodeGenerator::AssembleSwap(InstructionOperand* source,
__ Str(temp_1, src);
} else if (source->IsFPRegister()) {
UseScratchRegisterScope scope(masm());
FPRegister temp = scope.AcquireD();
FPRegister src = g.ToDoubleRegister(source);
VRegister temp = scope.AcquireD();
VRegister src = g.ToDoubleRegister(source);
if (destination->IsFPRegister()) {
FPRegister dst = g.ToDoubleRegister(destination);
VRegister dst = g.ToDoubleRegister(destination);
__ Fmov(temp, src);
__ Fmov(src, dst);
__ Fmov(dst, temp);
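
The register-to-register swap cycles both values through the scratch D register with three Fmov moves: temp gets src, src gets dst, dst gets temp. The same pattern in miniature:

#include <cassert>

// Swap two values through a scratch slot, mirroring the Fmov sequence.
void SwapViaScratch(double* src, double* dst) {
  double temp = *src;  // temp <- src
  *src = *dst;         // src  <- dst
  *dst = temp;         // dst  <- temp
}

int main() {
  double a = 1.0, b = 2.0;
  SwapViaScratch(&a, &b);
  assert(a == 2.0 && b == 1.0);
}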


@ -103,13 +103,13 @@ class Arm64OperandGenerator final : public OperandGenerator {
case kArithmeticImm:
return Assembler::IsImmAddSub(value);
case kLoadStoreImm8:
return IsLoadStoreImmediate(value, LSByte);
return IsLoadStoreImmediate(value, 0);
case kLoadStoreImm16:
return IsLoadStoreImmediate(value, LSHalfword);
return IsLoadStoreImmediate(value, 1);
case kLoadStoreImm32:
return IsLoadStoreImmediate(value, LSWord);
return IsLoadStoreImmediate(value, 2);
case kLoadStoreImm64:
return IsLoadStoreImmediate(value, LSDoubleWord);
return IsLoadStoreImmediate(value, 3);
case kNoImmediate:
return false;
case kShift32Imm: // Fall through.
@ -130,7 +130,7 @@ class Arm64OperandGenerator final : public OperandGenerator {
}
private:
bool IsLoadStoreImmediate(int64_t value, LSDataSize size) {
bool IsLoadStoreImmediate(int64_t value, unsigned size) {
return Assembler::IsImmLSScaled(value, size) ||
Assembler::IsImmLSUnscaled(value);
}
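
The deleted LSDataSize enumerators were already the log2 of the access width in bytes, which is why plain integers can replace them at the call sites above. The assumed mapping, recovered from the old names:

// Assumed values of the removed LSDataSize enumerators; the encoded
// size is log2 of the access width in bytes.
enum LSDataSizeSketch {
  LSByte = 0,        // 1-byte access
  LSHalfword = 1,    // 2-byte access
  LSWord = 2,        // 4-byte access
  LSDoubleWord = 3   // 8-byte access
};

static_assert(LSDoubleWord == 3, "8-byte accesses scale the offset by 8");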


@ -44,14 +44,12 @@ void DelayedMasm::Mov(const Register& rd,
__ Mov(rd, operand, discard_mode);
}
void DelayedMasm::Fmov(FPRegister fd, FPRegister fn) {
void DelayedMasm::Fmov(VRegister fd, VRegister fn) {
EmitPending();
__ Fmov(fd, fn);
}
void DelayedMasm::Fmov(FPRegister fd, double imm) {
void DelayedMasm::Fmov(VRegister fd, double imm) {
EmitPending();
__ Fmov(fd, imm);
}


@ -61,8 +61,8 @@ class DelayedMasm BASE_EMBEDDED {
inline void Mov(const Register& rd,
const Operand& operand,
DiscardMoveMode discard_mode = kDontDiscardForSameWReg);
inline void Fmov(FPRegister fd, FPRegister fn);
inline void Fmov(FPRegister fd, double imm);
inline void Fmov(VRegister fd, VRegister fn);
inline void Fmov(VRegister fd, double imm);
inline void LoadObject(Register result, Handle<Object> object);
// Instructions which try to merge with the pending instructions.
void StackSlotMove(LOperand* src, LOperand* dst);


@ -179,9 +179,9 @@ class TestAndBranch : public BranchGenerator {
// Test the input and branch if it is non-zero and not a NaN.
class BranchIfNonZeroNumber : public BranchGenerator {
public:
BranchIfNonZeroNumber(LCodeGen* codegen, const FPRegister& value,
const FPRegister& scratch)
: BranchGenerator(codegen), value_(value), scratch_(scratch) { }
BranchIfNonZeroNumber(LCodeGen* codegen, const VRegister& value,
const VRegister& scratch)
: BranchGenerator(codegen), value_(value), scratch_(scratch) {}
virtual void Emit(Label* label) const {
__ Fabs(scratch_, value_);
@ -198,8 +198,8 @@ class BranchIfNonZeroNumber : public BranchGenerator {
}
private:
const FPRegister& value_;
const FPRegister& scratch_;
const VRegister& value_;
const VRegister& scratch_;
};
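
Only the FPRegister-to-VRegister signature change is visible here, but the branch idiom is worth noting: Fabs followed by a compare against 0.0 (the rest of Emit is elided by the hunk, presumably a 'gt' branch) tests "non-zero and not a NaN" in one go, because a NaN makes the comparison unordered. A plain C++ analogue of the predicate — a sketch, not the generated code:

#include <cassert>
#include <cmath>

// fabs(x) > 0.0 is false for +/-0.0 and for NaN (unordered compare),
// so one compare-and-branch tests "non-zero and not a NaN".
bool IsNonZeroNumber(double x) { return std::fabs(x) > 0.0; }

int main() {
  assert(IsNonZeroNumber(1.5));
  assert(IsNonZeroNumber(-2.0));
  assert(!IsNonZeroNumber(0.0));
  assert(!IsNonZeroNumber(-0.0));
  assert(!IsNonZeroNumber(std::nan("")));
}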
@ -547,7 +547,7 @@ void LCodeGen::SaveCallerDoubles() {
while (!iterator.Done()) {
// TODO(all): Is this supposed to save just the callee-saved doubles? It
// looks like it's saving all of them.
FPRegister value = FPRegister::from_code(iterator.Current());
VRegister value = VRegister::from_code(iterator.Current());
__ Poke(value, count * kDoubleSize);
iterator.Advance();
count++;
@ -565,7 +565,7 @@ void LCodeGen::RestoreCallerDoubles() {
while (!iterator.Done()) {
// TODO(all): Is this supposed to restore just the callee-saved doubles? It
// looks like it's restoring all of them.
FPRegister value = FPRegister::from_code(iterator.Current());
VRegister value = VRegister::from_code(iterator.Current());
__ Peek(value, count * kDoubleSize);
iterator.Advance();
count++;
@ -1133,7 +1133,7 @@ MemOperand LCodeGen::ToMemOperand(LOperand* op, StackMode stack_mode) const {
(pushed_arguments_ + GetTotalFrameSlotCount()) * kPointerSize -
StandardFrameConstants::kFixedFrameSizeAboveFp;
int jssp_offset = fp_offset + jssp_offset_to_fp;
if (masm()->IsImmLSScaled(jssp_offset, LSDoubleWord)) {
if (masm()->IsImmLSScaled(jssp_offset, kPointerSizeLog2)) {
return MemOperand(masm()->StackPointer(), jssp_offset);
}
}
@ -1272,11 +1272,10 @@ void LCodeGen::EmitTestAndBranch(InstrType instr,
EmitBranchGeneric(instr, branch);
}
template<class InstrType>
template <class InstrType>
void LCodeGen::EmitBranchIfNonZeroNumber(InstrType instr,
const FPRegister& value,
const FPRegister& scratch) {
const VRegister& value,
const VRegister& scratch) {
BranchIfNonZeroNumber branch(this, value, scratch);
EmitBranchGeneric(instr, branch);
}
@ -2277,7 +2276,7 @@ void LCodeGen::DoClassOfTestAndBranch(LClassOfTestAndBranch* instr) {
void LCodeGen::DoCmpHoleAndBranchD(LCmpHoleAndBranchD* instr) {
DCHECK(instr->hydrogen()->representation().IsDouble());
FPRegister object = ToDoubleRegister(instr->object());
VRegister object = ToDoubleRegister(instr->object());
Register temp = ToRegister(instr->temp());
// If we don't have a NaN, we don't have the hole, so branch now to avoid the
@ -3274,7 +3273,7 @@ void LCodeGen::DoLoadNamedField(LLoadNamedField* instr) {
if (instr->hydrogen()->representation().IsDouble()) {
DCHECK(access.IsInobject());
FPRegister result = ToDoubleRegister(instr->result());
VRegister result = ToDoubleRegister(instr->result());
__ Ldr(result, FieldMemOperand(object, offset));
return;
}
@ -3434,7 +3433,7 @@ void LCodeGen::DoMathAbsTagged(LMathAbsTagged* instr) {
// The result is the magnitude (abs) of the smallest value a smi can
// represent, encoded as a double.
__ Mov(result_bits, double_to_rawbits(0x80000000));
__ Mov(result_bits, bit_cast<uint64_t>(static_cast<double>(0x80000000)));
__ B(deferred->allocation_entry());
__ Bind(deferred->exit());
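
The new constant folds double_to_rawbits at compile time via bit_cast. As a sanity check: 0x80000000 is 2^31, whose IEEE-754 double encoding is sign 0, biased exponent 31 + 1023 = 0x41E, mantissa 0:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  double d = static_cast<double>(0x80000000u);  // 2^31
  uint64_t bits;
  memcpy(&bits, &d, sizeof(bits));
  // sign 0, biased exponent 0x41E (31 + 1023), mantissa 0.
  assert(bits == 0x41E0000000000000ULL);
}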
@ -4976,7 +4975,7 @@ void LCodeGen::DoStoreNamedField(LStoreNamedField* instr) {
DCHECK(access.IsInobject());
DCHECK(!instr->hydrogen()->has_transition());
DCHECK(!instr->hydrogen()->NeedsWriteBarrier());
FPRegister value = ToDoubleRegister(instr->value());
VRegister value = ToDoubleRegister(instr->value());
__ Str(value, FieldMemOperand(object, offset));
return;
}
@ -5014,7 +5013,7 @@ void LCodeGen::DoStoreNamedField(LStoreNamedField* instr) {
if (FLAG_unbox_double_fields && representation.IsDouble()) {
DCHECK(access.IsInobject());
FPRegister value = ToDoubleRegister(instr->value());
VRegister value = ToDoubleRegister(instr->value());
__ Str(value, FieldMemOperand(object, offset));
} else if (representation.IsSmi() &&
instr->hydrogen()->value()->representation().IsInteger32()) {


@ -159,10 +159,9 @@ class LCodeGen: public LCodeGenBase {
const Register& value,
uint64_t mask);
template<class InstrType>
void EmitBranchIfNonZeroNumber(InstrType instr,
const FPRegister& value,
const FPRegister& scratch);
template <class InstrType>
void EmitBranchIfNonZeroNumber(InstrType instr, const VRegister& value,
const VRegister& scratch);
template<class InstrType>
void EmitBranchIfHeapNumber(InstrType instr,


@ -68,7 +68,7 @@ class LGapResolver BASE_EMBEDDED {
// These two methods switch from one mode to the other.
void AcquireSavedValueRegister() { masm_.AcquireScratchRegister(); }
void ReleaseSavedValueRegister() { masm_.ReleaseScratchRegister(); }
const FPRegister& SavedFPValueRegister() {
const VRegister& SavedFPValueRegister() {
// We use the Crankshaft floating-point scratch register to break a cycle
// involving double values as the MacroAssembler will not need it for the
// operations performed by the gap resolver.


@ -1595,6 +1595,7 @@
'arm64/macro-assembler-arm64-inl.h',
'arm64/simulator-arm64.cc',
'arm64/simulator-arm64.h',
'arm64/simulator-logic-arm64.cc',
'arm64/utils-arm64.cc',
'arm64/utils-arm64.h',
'arm64/eh-frame-arm64.cc',

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -59,19 +59,30 @@ bool Equal64(uint64_t expected, const RegisterDump*, uint64_t result) {
return expected == result;
}
bool Equal128(vec128_t expected, const RegisterDump*, vec128_t result) {
if ((result.h != expected.h) || (result.l != expected.l)) {
printf("Expected 0x%016" PRIx64 "%016" PRIx64
"\t "
"Found 0x%016" PRIx64 "%016" PRIx64 "\n",
expected.h, expected.l, result.h, result.l);
}
return ((expected.h == result.h) && (expected.l == result.l));
}
bool EqualFP32(float expected, const RegisterDump*, float result) {
if (float_to_rawbits(expected) == float_to_rawbits(result)) {
if (bit_cast<uint32_t>(expected) == bit_cast<uint32_t>(result)) {
return true;
} else {
if (std::isnan(expected) || (expected == 0.0)) {
printf("Expected 0x%08" PRIx32 "\t Found 0x%08" PRIx32 "\n",
float_to_rawbits(expected), float_to_rawbits(result));
bit_cast<uint32_t>(expected), bit_cast<uint32_t>(result));
} else {
printf("Expected %.9f (0x%08" PRIx32 ")\t "
printf("Expected %.9f (0x%08" PRIx32
")\t "
"Found %.9f (0x%08" PRIx32 ")\n",
expected, float_to_rawbits(expected),
result, float_to_rawbits(result));
expected, bit_cast<uint32_t>(expected), result,
bit_cast<uint32_t>(result));
}
return false;
}
@ -79,18 +90,19 @@ bool EqualFP32(float expected, const RegisterDump*, float result) {
bool EqualFP64(double expected, const RegisterDump*, double result) {
if (double_to_rawbits(expected) == double_to_rawbits(result)) {
if (bit_cast<uint64_t>(expected) == bit_cast<uint64_t>(result)) {
return true;
}
if (std::isnan(expected) || (expected == 0.0)) {
printf("Expected 0x%016" PRIx64 "\t Found 0x%016" PRIx64 "\n",
double_to_rawbits(expected), double_to_rawbits(result));
bit_cast<uint64_t>(expected), bit_cast<uint64_t>(result));
} else {
printf("Expected %.17f (0x%016" PRIx64 ")\t "
printf("Expected %.17f (0x%016" PRIx64
")\t "
"Found %.17f (0x%016" PRIx64 ")\n",
expected, double_to_rawbits(expected),
result, double_to_rawbits(result));
expected, bit_cast<uint64_t>(expected), result,
bit_cast<uint64_t>(result));
}
return false;
}
@ -119,27 +131,31 @@ bool Equal64(uint64_t expected,
return Equal64(expected, core, result);
}
bool Equal128(uint64_t expected_h, uint64_t expected_l,
const RegisterDump* core, const VRegister& vreg) {
CHECK(vreg.Is128Bits());
vec128_t expected = {expected_l, expected_h};
vec128_t result = core->qreg(vreg.code());
return Equal128(expected, core, result);
}
bool EqualFP32(float expected,
const RegisterDump* core,
const FPRegister& fpreg) {
bool EqualFP32(float expected, const RegisterDump* core,
const VRegister& fpreg) {
CHECK(fpreg.Is32Bits());
// Retrieve the corresponding D register so we can check that the upper part
// was properly cleared.
uint64_t result_64 = core->dreg_bits(fpreg.code());
if ((result_64 & 0xffffffff00000000L) != 0) {
printf("Expected 0x%08" PRIx32 " (%f)\t Found 0x%016" PRIx64 "\n",
float_to_rawbits(expected), expected, result_64);
bit_cast<uint32_t>(expected), expected, result_64);
return false;
}
return EqualFP32(expected, core, core->sreg(fpreg.code()));
}
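
Reading back the full D register works because, on AArch64, a scalar write to an S register zeroes the remaining bits of the underlying vector register, so any stale upper bits indicate a bug in the code under test. A host-side sketch of the same check:

#include <cstdint>
#include <cstdio>

// Sketch: after a test writes a 32-bit S result, the upper half of the
// corresponding 64-bit D view must read back as zero.
bool UpperHalfCleared(uint64_t d_bits) {
  if ((d_bits & 0xffffffff00000000ULL) != 0) {
    printf("Found stale upper bits: 0x%016llx\n",
           static_cast<unsigned long long>(d_bits));
    return false;
  }
  return true;
}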
bool EqualFP64(double expected,
const RegisterDump* core,
const FPRegister& fpreg) {
bool EqualFP64(double expected, const RegisterDump* core,
const VRegister& fpreg) {
CHECK(fpreg.Is64Bits());
return EqualFP64(expected, core, core->dreg(fpreg.code()));
}
@ -198,7 +214,7 @@ bool EqualRegisters(const RegisterDump* a, const RegisterDump* b) {
}
}
for (unsigned i = 0; i < kNumberOfFPRegisters; i++) {
for (unsigned i = 0; i < kNumberOfVRegisters; i++) {
uint64_t a_bits = a->dreg_bits(i);
uint64_t b_bits = b->dreg_bits(i);
if (a_bits != b_bits) {
@ -238,29 +254,28 @@ RegList PopulateRegisterArray(Register* w, Register* x, Register* r,
return list;
}
RegList PopulateFPRegisterArray(FPRegister* s, FPRegister* d, FPRegister* v,
int reg_size, int reg_count, RegList allowed) {
RegList PopulateVRegisterArray(VRegister* s, VRegister* d, VRegister* v,
int reg_size, int reg_count, RegList allowed) {
RegList list = 0;
int i = 0;
for (unsigned n = 0; (n < kNumberOfFPRegisters) && (i < reg_count); n++) {
for (unsigned n = 0; (n < kNumberOfVRegisters) && (i < reg_count); n++) {
if (((1UL << n) & allowed) != 0) {
// Only assign allowed registers.
if (v) {
v[i] = FPRegister::Create(n, reg_size);
v[i] = VRegister::Create(n, reg_size);
}
if (d) {
d[i] = FPRegister::Create(n, kDRegSizeInBits);
d[i] = VRegister::Create(n, kDRegSizeInBits);
}
if (s) {
s[i] = FPRegister::Create(n, kSRegSizeInBits);
s[i] = VRegister::Create(n, kSRegSizeInBits);
}
list |= (1UL << n);
i++;
}
}
// Check that we got enough registers.
CHECK(CountSetBits(list, kNumberOfFPRegisters) == reg_count);
CHECK(CountSetBits(list, kNumberOfVRegisters) == reg_count);
return list;
}
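
RegList is a plain bitmask with one bit per register code, so both the 'allowed' filter and the returned list are simple bit operations. A minimal sketch of the selection loop on its own:

#include <cstdint>

typedef uint64_t RegList;

// Collect up to 'count' register codes whose bits are set in 'allowed',
// returning the mask of codes actually taken, as in PopulateVRegisterArray.
RegList CollectAllowed(RegList allowed, int count, unsigned* codes_out) {
  RegList taken = 0;
  int i = 0;
  for (unsigned n = 0; n < 64 && i < count; n++) {
    if ((allowed & (1ULL << n)) != 0) {
      codes_out[i++] = n;
      taken |= 1ULL << n;
    }
  }
  return taken;
}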
@ -290,10 +305,10 @@ void Clobber(MacroAssembler* masm, RegList reg_list, uint64_t const value) {
void ClobberFP(MacroAssembler* masm, RegList reg_list, double const value) {
FPRegister first = NoFPReg;
for (unsigned i = 0; i < kNumberOfFPRegisters; i++) {
VRegister first = NoVReg;
for (unsigned i = 0; i < kNumberOfVRegisters; i++) {
if (reg_list & (1UL << i)) {
FPRegister dn = FPRegister::Create(i, kDRegSizeInBits);
VRegister dn = VRegister::Create(i, kDRegSizeInBits);
if (!first.IsValid()) {
// This is the first register we've hit, so construct the literal.
__ Fmov(dn, value);
@ -312,7 +327,7 @@ void Clobber(MacroAssembler* masm, CPURegList reg_list) {
if (reg_list.type() == CPURegister::kRegister) {
// This will always clobber X registers.
Clobber(masm, reg_list.list());
} else if (reg_list.type() == CPURegister::kFPRegister) {
} else if (reg_list.type() == CPURegister::kVRegister) {
// This will always clobber D registers.
ClobberFP(masm, reg_list.list());
} else {
@ -343,6 +358,7 @@ void RegisterDump::Dump(MacroAssembler* masm) {
const int w_offset = offsetof(dump_t, w_);
const int d_offset = offsetof(dump_t, d_);
const int s_offset = offsetof(dump_t, s_);
const int q_offset = offsetof(dump_t, q_);
const int sp_offset = offsetof(dump_t, sp_);
const int wsp_offset = offsetof(dump_t, wsp_);
const int flags_offset = offsetof(dump_t, flags_);
@ -377,18 +393,25 @@ void RegisterDump::Dump(MacroAssembler* masm) {
// Dump D registers.
__ Add(dump, dump_base, d_offset);
for (unsigned i = 0; i < kNumberOfFPRegisters; i += 2) {
__ Stp(FPRegister::DRegFromCode(i), FPRegister::DRegFromCode(i + 1),
for (unsigned i = 0; i < kNumberOfVRegisters; i += 2) {
__ Stp(VRegister::DRegFromCode(i), VRegister::DRegFromCode(i + 1),
MemOperand(dump, i * kDRegSize));
}
// Dump S registers.
__ Add(dump, dump_base, s_offset);
for (unsigned i = 0; i < kNumberOfFPRegisters; i += 2) {
__ Stp(FPRegister::SRegFromCode(i), FPRegister::SRegFromCode(i + 1),
for (unsigned i = 0; i < kNumberOfVRegisters; i += 2) {
__ Stp(VRegister::SRegFromCode(i), VRegister::SRegFromCode(i + 1),
MemOperand(dump, i * kSRegSize));
}
// Dump Q registers.
__ Add(dump, dump_base, q_offset);
for (unsigned i = 0; i < kNumberOfVRegisters; i += 2) {
__ Stp(VRegister::QRegFromCode(i), VRegister::QRegFromCode(i + 1),
MemOperand(dump, i * kQRegSize));
}
// Dump the flags.
__ Mrs(tmp, NZCV);
__ Str(tmp, MemOperand(dump_base, flags_offset));


@ -39,6 +39,11 @@
namespace v8 {
namespace internal {
// Structure representing Q registers in a RegisterDump.
struct vec128_t {
uint64_t l;
uint64_t h;
};
// RegisterDump: Object allowing integer, floating point and flags registers
// to be saved to itself for future reference.
@ -72,14 +77,14 @@ class RegisterDump {
return dump_.x_[code];
}
// FPRegister accessors.
// VRegister accessors.
inline uint32_t sreg_bits(unsigned code) const {
CHECK(FPRegAliasesMatch(code));
return dump_.s_[code];
}
inline float sreg(unsigned code) const {
return rawbits_to_float(sreg_bits(code));
return bit_cast<float>(sreg_bits(code));
}
inline uint64_t dreg_bits(unsigned code) const {
@ -88,9 +93,11 @@ class RegisterDump {
}
inline double dreg(unsigned code) const {
return rawbits_to_double(dreg_bits(code));
return bit_cast<double>(dreg_bits(code));
}
inline vec128_t qreg(unsigned code) const { return dump_.q_[code]; }
// Stack pointer accessors.
inline int64_t spreg() const {
CHECK(SPRegAliasesMatch());
@ -135,7 +142,7 @@ class RegisterDump {
// As RegAliasesMatch, but for floating-point registers.
bool FPRegAliasesMatch(unsigned code) const {
CHECK(IsComplete());
CHECK(code < kNumberOfFPRegisters);
CHECK(code < kNumberOfVRegisters);
return (dump_.d_[code] & kSRegMask) == dump_.s_[code];
}
@ -147,8 +154,11 @@ class RegisterDump {
uint32_t w_[kNumberOfRegisters];
// Floating-point registers, as raw bits.
uint64_t d_[kNumberOfFPRegisters];
uint32_t s_[kNumberOfFPRegisters];
uint64_t d_[kNumberOfVRegisters];
uint32_t s_[kNumberOfVRegisters];
// Vector registers.
vec128_t q_[kNumberOfVRegisters];
// The stack pointer.
uint64_t sp_;
@ -163,12 +173,18 @@ class RegisterDump {
} dump_;
static dump_t for_sizeof();
STATIC_ASSERT(sizeof(for_sizeof().d_[0]) == kDRegSize);
STATIC_ASSERT(sizeof(for_sizeof().s_[0]) == kSRegSize);
STATIC_ASSERT(sizeof(for_sizeof().d_[0]) == kXRegSize);
STATIC_ASSERT(sizeof(for_sizeof().s_[0]) == kWRegSize);
STATIC_ASSERT(sizeof(for_sizeof().x_[0]) == kXRegSize);
STATIC_ASSERT(sizeof(for_sizeof().w_[0]) == kWRegSize);
static_assert(kXRegSize == kDRegSize, "X and D registers must be same size.");
static_assert(kWRegSize == kSRegSize, "W and S registers must be same size.");
static_assert(sizeof(for_sizeof().q_[0]) == kQRegSize,
"Array elements must be size of Q register.");
static_assert(sizeof(for_sizeof().d_[0]) == kDRegSize,
"Array elements must be size of D register.");
static_assert(sizeof(for_sizeof().s_[0]) == kSRegSize,
"Array elements must be size of S register.");
static_assert(sizeof(for_sizeof().x_[0]) == kXRegSize,
"Array elements must be size of X register.");
static_assert(sizeof(for_sizeof().w_[0]) == kWRegSize,
"Array elements must be size of W register.");
};
// Some of these methods don't use the RegisterDump argument, but they have to
@ -183,12 +199,14 @@ bool Equal32(uint32_t expected, const RegisterDump* core, const Register& reg);
bool Equal64(uint64_t expected, const RegisterDump* core, const Register& reg);
bool EqualFP32(float expected, const RegisterDump* core,
const FPRegister& fpreg);
const VRegister& fpreg);
bool EqualFP64(double expected, const RegisterDump* core,
const FPRegister& fpreg);
const VRegister& fpreg);
bool Equal64(const Register& reg0, const RegisterDump* core,
const Register& reg1);
bool Equal128(uint64_t expected_h, uint64_t expected_l,
const RegisterDump* core, const VRegister& reg);
bool EqualNzcv(uint32_t expected, uint32_t result);
@ -208,8 +226,8 @@ RegList PopulateRegisterArray(Register* w, Register* x, Register* r,
int reg_size, int reg_count, RegList allowed);
// As PopulateRegisterArray, but for floating-point registers.
RegList PopulateFPRegisterArray(FPRegister* s, FPRegister* d, FPRegister* v,
int reg_size, int reg_count, RegList allowed);
RegList PopulateVRegisterArray(VRegister* s, VRegister* d, VRegister* v,
int reg_size, int reg_count, RegList allowed);
// Overwrite the contents of the specified registers. This enables tests to
// check that register contents are written in cases where it's likely that the