[dict-proto] SIMD support for SwissNameDictionary in Torque
This CL adds a Torque-counterpart for swiss_table::GroupSse2Impl in Torque. This allows the Torque version of SwissNameDictionary to use SSE for lookups, rather than needing to bailout to the runtime on x64/ia32. Bug: v8:11330 Change-Id: I74e3f97c460a8b89031016967ec0e545265016a9 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2787485 Reviewed-by: Igor Sheludko <ishell@chromium.org> Reviewed-by: Santiago Aboy Solanes <solanes@chromium.org> Reviewed-by: Zhi An Ng <zhin@chromium.org> Commit-Queue: Igor Sheludko <ishell@chromium.org> Cr-Commit-Position: refs/heads/master@{#73727}
This commit is contained in:
parent
eff32ae88b
commit
856e8577e3
@ -113,6 +113,9 @@ type bool generates 'TNode<BoolT>' constexpr 'bool';
|
||||
type bint generates 'TNode<BInt>' constexpr 'BInt';
|
||||
type string constexpr 'const char*';
|
||||
|
||||
type Simd128 generates 'TNode<Simd128T>';
|
||||
type I8X16 extends Simd128 generates 'TNode<I8x16T>';
|
||||
|
||||
// Represents a std::function which produces the generated TNode type of T.
|
||||
// Useful for passing values to and from CSA code that uses LazyNode<T>, which
|
||||
// is a typedef for std::function<TNode<T>()>. Can be created with %MakeLazy and
|
||||
@ -917,7 +920,7 @@ extern operator '*' macro ConstexprInt31Mul(
|
||||
extern operator '-' macro Int32Sub(int16, int16): int32;
|
||||
extern operator '-' macro Int32Sub(uint16, uint16): int32;
|
||||
extern operator '-' macro Int32Sub(int32, int32): int32;
|
||||
extern operator '-' macro UInt32Sub(uint32, uint32): uint32;
|
||||
extern operator '-' macro Uint32Sub(uint32, uint32): uint32;
|
||||
extern operator '*' macro Int32Mul(int32, int32): int32;
|
||||
extern operator '*' macro Uint32Mul(uint32, uint32): uint32;
|
||||
extern operator '/' macro Int32Div(int32, int32): int32;
|
||||
@ -1050,6 +1053,7 @@ operator '==' macro PromiseStateEquals(
|
||||
}
|
||||
|
||||
extern macro CountLeadingZeros64(uint64): int64;
|
||||
extern macro CountTrailingZeros32(uint32): int32;
|
||||
extern macro CountTrailingZeros64(uint64): int64;
|
||||
|
||||
extern macro TaggedIsSmi(Object): bool;
|
||||
@ -1845,3 +1849,8 @@ extern operator '[]' macro LoadWeakFixedArrayElement(
|
||||
const kNoHashSentinel:
|
||||
constexpr int32 generates 'PropertyArray::kNoHashSentinel';
|
||||
extern macro LoadNameHash(Name): uint32;
|
||||
|
||||
extern macro LoadSimd128(intptr): Simd128;
|
||||
extern macro I8x16BitMask(I8X16): int32;
|
||||
extern macro I8x16Eq(I8X16, I8X16): I8X16;
|
||||
extern macro I8x16Splat(int32): I8X16;
|
||||
|
@ -335,3 +335,6 @@ Convert<PromiseState, int32>(s: int32): PromiseState {
|
||||
Convert<ScopeFlags, Smi>(s: Smi): ScopeFlags {
|
||||
return %RawDownCast<ScopeFlags>(Unsigned(SmiToInt32(s)));
|
||||
}
|
||||
Convert<I8X16, Simd128>(s: Simd128): I8X16 {
|
||||
return %RawDownCast<I8X16>(s);
|
||||
}
|
||||
|
@ -1137,6 +1137,10 @@ class V8_EXPORT_PRIVATE CodeStubAssembler
|
||||
Map::kConstructorOrBackPointerOrNativeContextOffset);
|
||||
}
|
||||
|
||||
TNode<Simd128T> LoadSimd128(TNode<IntPtrT> ptr) {
|
||||
return Load<Simd128T>(ptr);
|
||||
}
|
||||
|
||||
// Reference is the CSA-equivalent of a Torque reference value, representing
|
||||
// an inner pointer into a HeapObject.
|
||||
//
|
||||
|
@ -110,6 +110,16 @@ struct BoolT : Word32T {};
|
||||
template <class T1, class T2>
|
||||
struct PairT {};
|
||||
|
||||
struct Simd128T : UntaggedT {
|
||||
static const MachineRepresentation kMachineRepresentation =
|
||||
MachineRepresentation::kSimd128;
|
||||
static constexpr MachineType kMachineType = MachineType::Simd128();
|
||||
};
|
||||
|
||||
struct I8x16T : Simd128T {};
|
||||
struct I16x8T : Simd128T {};
|
||||
struct I32x2T : Simd128T {};
|
||||
|
||||
inline constexpr MachineType CommonMachineType(MachineType type1,
|
||||
MachineType type2) {
|
||||
return (type1 == type2) ? type1
|
||||
|
@ -270,6 +270,7 @@ class CodeAssemblerParameterizedLabel;
|
||||
V(Float64Min, Float64T, Float64T, Float64T) \
|
||||
V(Float64InsertLowWord32, Float64T, Float64T, Word32T) \
|
||||
V(Float64InsertHighWord32, Float64T, Float64T, Word32T) \
|
||||
V(I8x16Eq, I8x16T, I8x16T, I8x16T) \
|
||||
V(IntPtrAdd, WordT, WordT, WordT) \
|
||||
V(IntPtrSub, WordT, WordT, WordT) \
|
||||
V(IntPtrMul, WordT, WordT, WordT) \
|
||||
@ -374,6 +375,8 @@ TNode<Float64T> Float64Add(TNode<Float64T> a, TNode<Float64T> b);
|
||||
V(Word32BitwiseNot, Word32T, Word32T) \
|
||||
V(WordNot, WordT, WordT) \
|
||||
V(Word64Not, Word64T, Word64T) \
|
||||
V(I8x16BitMask, Int32T, I8x16T) \
|
||||
V(I8x16Splat, I8x16T, Int32T) \
|
||||
V(Int32AbsWithOverflow, PAIR_TYPE(Int32T, BoolT), Int32T) \
|
||||
V(Int64AbsWithOverflow, PAIR_TYPE(Int64T, BoolT), Int64T) \
|
||||
V(IntPtrAbsWithOverflow, PAIR_TYPE(IntPtrT, BoolT), IntPtrT) \
|
||||
|
@ -272,6 +272,7 @@ class MachineRepresentationInferrer {
|
||||
case IrOpcode::kFloat64ExtractLowWord32:
|
||||
case IrOpcode::kFloat64ExtractHighWord32:
|
||||
case IrOpcode::kWord32Popcnt:
|
||||
case IrOpcode::kI8x16BitMask:
|
||||
MACHINE_UNOP_32_LIST(LABEL)
|
||||
MACHINE_BINOP_32_LIST(LABEL) {
|
||||
representation_vector_[node->id()] =
|
||||
@ -323,6 +324,8 @@ class MachineRepresentationInferrer {
|
||||
break;
|
||||
case IrOpcode::kI32x4ReplaceLane:
|
||||
case IrOpcode::kI32x4Splat:
|
||||
case IrOpcode::kI8x16Splat:
|
||||
case IrOpcode::kI8x16Eq:
|
||||
representation_vector_[node->id()] =
|
||||
MachineRepresentation::kSimd128;
|
||||
break;
|
||||
@ -445,6 +448,7 @@ class MachineRepresentationChecker {
|
||||
case IrOpcode::kI32x4ExtractLane:
|
||||
case IrOpcode::kI16x8ExtractLaneU:
|
||||
case IrOpcode::kI16x8ExtractLaneS:
|
||||
case IrOpcode::kI8x16BitMask:
|
||||
case IrOpcode::kI8x16ExtractLaneU:
|
||||
case IrOpcode::kI8x16ExtractLaneS:
|
||||
CheckValueInputRepresentationIs(node, 0,
|
||||
@ -456,8 +460,16 @@ class MachineRepresentationChecker {
|
||||
CheckValueInputForInt32Op(node, 1);
|
||||
break;
|
||||
case IrOpcode::kI32x4Splat:
|
||||
case IrOpcode::kI8x16Splat:
|
||||
CheckValueInputForInt32Op(node, 0);
|
||||
break;
|
||||
case IrOpcode::kI8x16Eq:
|
||||
CheckValueInputRepresentationIs(node, 0,
|
||||
MachineRepresentation::kSimd128);
|
||||
CheckValueInputRepresentationIs(node, 1,
|
||||
MachineRepresentation::kSimd128);
|
||||
break;
|
||||
|
||||
#define LABEL(opcode) case IrOpcode::k##opcode:
|
||||
case IrOpcode::kChangeInt32ToTagged:
|
||||
case IrOpcode::kChangeUint32ToTagged:
|
||||
|
@ -881,6 +881,12 @@ class V8_EXPORT_PRIVATE RawMachineAssembler {
|
||||
Node* I16x8Splat(Node* a) { return AddNode(machine()->I16x8Splat(), a); }
|
||||
Node* I8x16Splat(Node* a) { return AddNode(machine()->I8x16Splat(), a); }
|
||||
|
||||
Node* I8x16BitMask(Node* a) { return AddNode(machine()->I8x16BitMask(), a); }
|
||||
|
||||
Node* I8x16Eq(Node* a, Node* b) {
|
||||
return AddNode(machine()->I8x16Eq(), a, b);
|
||||
}
|
||||
|
||||
// Stack operations.
|
||||
Node* LoadFramePointer() { return AddNode(machine()->LoadFramePointer()); }
|
||||
Node* LoadParentFramePointer() {
|
||||
|
@ -349,11 +349,29 @@ struct GroupPortableImpl {
|
||||
};
|
||||
|
||||
// Determine which Group implementation SwissNameDictionary uses.
|
||||
#if defined(V8_ENABLE_SWISS_NAME_DICTIONARY) && DEBUG
|
||||
// TODO(v8:11388) If v8_enable_swiss_name_dictionary is enabled, we are supposed
|
||||
// to use SwissNameDictionary as the dictionary backing store. If we want to use
|
||||
// the SIMD version of SwissNameDictionary, that would require us to compile SSE
|
||||
// instructions into the snapshot that exceed the minimum requirements for V8
|
||||
// SSE support. Therefore, this fails a DCHECK. However, given the experimental
|
||||
// nature of v8_enable_swiss_name_dictionary mode, we only expect this to be run
|
||||
// by developers/bots, that always have the necessary instructions. This means
|
||||
// that if v8_enable_swiss_name_dictionary is enabled and debug mode isn't, we
|
||||
// ignore the DCHECK that would fail in debug mode. However, if both
|
||||
// v8_enable_swiss_name_dictionary and debug mode are enabled, we must fallback
|
||||
// to the non-SSE implementation. Given that V8 requires SSE2, there should be a
|
||||
// solution that doesn't require the workaround present here. Instead, the
|
||||
// backend should only use SSE2 when compiling the SIMD version of
|
||||
// SwissNameDictionary into the builtin.
|
||||
using Group = GroupPortableImpl;
|
||||
#else
|
||||
#if SWISS_TABLE_HAVE_SSE2
|
||||
using Group = GroupSse2Impl;
|
||||
#else
|
||||
using Group = GroupPortableImpl;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#undef SWISS_TABLE_HAVE_SSE2
|
||||
#undef SWISS_TABLE_HAVE_SSE3
|
||||
|
@ -45,6 +45,10 @@ struct ProbeSequence {
|
||||
index: uint32;
|
||||
}
|
||||
|
||||
macro ClearLowestSetBit<T: type>(value: T): T {
|
||||
return value & (value - FromConstexpr<T>(1));
|
||||
}
|
||||
|
||||
const kByteMaskShift: uint64 = 3;
|
||||
|
||||
// Counterpart to swiss_table::BitMask<uint64_t, kWidth, 3>, as used by
|
||||
@ -61,12 +65,31 @@ struct ByteMask {
|
||||
|
||||
// Counterpart to operator++() in C++ version.
|
||||
macro ClearLowestSetBit() {
|
||||
this.mask = this.mask & (this.mask - FromConstexpr<uint64>(1));
|
||||
this.mask = ClearLowestSetBit<uint64>(this.mask);
|
||||
}
|
||||
|
||||
mask: uint64;
|
||||
}
|
||||
|
||||
// Counterpart to swiss_table::BitMask<uint32_t, kWidth, 0>, as used by
|
||||
// swiss_table::GroupSse2Impl in C++ implementation.
|
||||
struct BitMask {
|
||||
macro HasBitsSet(): bool {
|
||||
return this.mask != FromConstexpr<uint32>(0);
|
||||
}
|
||||
|
||||
macro LowestBitSet(): int32 {
|
||||
return Convert<int32>(CountTrailingZeros32(this.mask));
|
||||
}
|
||||
|
||||
// Counterpart to operator++() in C++ version.
|
||||
macro ClearLowestSetBit() {
|
||||
this.mask = ClearLowestSetBit<uint32>(this.mask);
|
||||
}
|
||||
|
||||
mask: uint32;
|
||||
}
|
||||
|
||||
macro H1(hash: uint32): uint32 {
|
||||
return hash >>> Unsigned(FromConstexpr<int32>(kH2Bits));
|
||||
}
|
||||
@ -80,6 +103,7 @@ const kLsbs: constexpr uint64
|
||||
const kMsbs: constexpr uint64
|
||||
generates 'swiss_table::GroupPortableImpl::kMsbs';
|
||||
|
||||
// Counterpart to swiss_table::GroupPortableImpl in C++.
|
||||
struct GroupPortableImpl {
|
||||
macro Match(h2: uint32): ByteMask {
|
||||
const x = Word64Xor(this.ctrl, (kLsbs * Convert<uint64>(h2)));
|
||||
@ -95,6 +119,45 @@ struct GroupPortableImpl {
|
||||
const ctrl: uint64;
|
||||
}
|
||||
|
||||
// Counterpart to swiss_table::GroupSse2Impl in C++. Note that the name is
|
||||
// chosen for consistency, this struct is not actually SSE-specific.
|
||||
struct GroupSse2Impl {
|
||||
macro Match(h2: uint32): BitMask {
|
||||
// Fill 16 8-bit lanes with |h2|:
|
||||
const searchPattern = I8x16Splat(Signed(h2));
|
||||
// Create a 128 bit mask such that in each of the 16 8-bit lanes, the MSB
|
||||
// indicates whether or not the corresponding lanes of |this.ctrl| and
|
||||
// |searchPattern| have the same value:
|
||||
const matches128 = I8x16Eq(searchPattern, this.ctrl);
|
||||
// Turn the 128 bit mask into a 32 bit one, by turning the MSB of the i-th
|
||||
// lane into the i-th bit in the output mask:
|
||||
const matches32 = Unsigned(I8x16BitMask(matches128));
|
||||
return BitMask{mask: matches32};
|
||||
}
|
||||
|
||||
macro MatchEmpty(): BitMask {
|
||||
// TODO(v8:11330) The C++ implementation in
|
||||
// swiss_table::GroupSse2Impl::MatchEmpty utilizes a special trick that is
|
||||
// possible due to kEmpty being -128 and allows shaving off one SSE
|
||||
// instruction. This depends on having access to _mm_cmpeq_epi8 aka PCMPEQB,
|
||||
// which the V8 backend currently doesn't expose.
|
||||
|
||||
// Fill 16 8-bit lanes with |kEmpty|:
|
||||
const searchPattern =
|
||||
I8x16Splat(Convert<int32>(FromConstexpr<uint8>(ctrl::kEmpty)));
|
||||
// Create a 128 bit mask such that in each of the 16 8-bit lanes, the MSB
|
||||
// indicates whether or not the corresponding lanes of |this.ctrl| contains
|
||||
// |kEmpty|:
|
||||
const matches128 = I8x16Eq(searchPattern, this.ctrl);
|
||||
// Turn the 128 bit mask into a 32 bit one, by turning the MSB of the i-th
|
||||
// lane into the i-th bit in the output mask:
|
||||
const matches32 = Unsigned(I8x16BitMask(matches128));
|
||||
return BitMask{mask: matches32};
|
||||
}
|
||||
|
||||
const ctrl: I8X16;
|
||||
}
|
||||
|
||||
struct GroupPortableLoader {
|
||||
macro LoadGroup(ctrlPtr: intptr): GroupPortableImpl {
|
||||
return GroupPortableImpl{
|
||||
@ -102,4 +165,10 @@ struct GroupPortableLoader {
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
struct GroupSse2Loader {
|
||||
macro LoadGroup(ctrlPtr: intptr): GroupSse2Impl {
|
||||
return GroupSse2Impl{ctrl: Convert<I8X16>(LoadSimd128(ctrlPtr))};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -28,12 +28,6 @@ const kMax2ByteMetaTableCapacity: constexpr int32
|
||||
const kNotFoundSentinel:
|
||||
constexpr int32 generates 'SwissNameDictionary::kNotFoundSentinel';
|
||||
|
||||
extern macro LoadSwissNameDictionaryNumberOfElements(
|
||||
SwissNameDictionary, intptr): intptr;
|
||||
|
||||
extern macro LoadSwissNameDictionaryNumberOfDeletedElements(
|
||||
SwissNameDictionary, intptr): intptr;
|
||||
|
||||
extern macro LoadSwissNameDictionaryKey(SwissNameDictionary, intptr): Name;
|
||||
|
||||
extern macro StoreSwissNameDictionaryKeyAndValue(
|
||||
@ -287,14 +281,8 @@ macro SwissNameDictionaryDelete(table: SwissNameDictionary, entry: intptr)
|
||||
@export
|
||||
macro SwissNameDictionaryFindEntrySIMD(table: SwissNameDictionary, key: Name):
|
||||
never labels Found(intptr), NotFound {
|
||||
// TODO(v8:11330) Not implemented in Torque, yet, doing runtime call
|
||||
// instead.
|
||||
const res = runtime::SwissTableFindEntry(kNoContext, table, key);
|
||||
if (res == kNotFoundSentinel) {
|
||||
goto NotFound;
|
||||
} else {
|
||||
goto Found(Convert<intptr>(res));
|
||||
}
|
||||
FindEntry<GroupSse2Loader>(table, key)
|
||||
otherwise Found, NotFound;
|
||||
}
|
||||
|
||||
@export
|
||||
@ -317,26 +305,8 @@ Found(intptr),
|
||||
macro SwissNameDictionaryAddSIMD(
|
||||
table: SwissNameDictionary, key: Name, value: Object,
|
||||
propertyDetails: uint8) labels Bailout {
|
||||
// TODO(v8:11330) Not implemented in Torque, yet, doing runtime call
|
||||
// instead. However, must bailout if the runtime call would allocate a new
|
||||
// dictionary.
|
||||
|
||||
// Determine if bailout needed:
|
||||
const capacity = Convert<intptr>(table.capacity);
|
||||
const maxUsable = SwissNameDictionaryMaxUsableCapacity(capacity);
|
||||
// Doing two independent accesses to the meta table here (repeating the
|
||||
// branching), rather than combining the accesses. Accepting that due to
|
||||
// the fact that this is a slow placeholder until the SIMD version
|
||||
// replaces it.
|
||||
const nof = LoadSwissNameDictionaryNumberOfElements(table, capacity);
|
||||
const nod = LoadSwissNameDictionaryNumberOfDeletedElements(table, capacity);
|
||||
const used = nof + nod;
|
||||
if (used >= maxUsable) {
|
||||
goto Bailout;
|
||||
}
|
||||
runtime::SwissTableAdd(
|
||||
kNoContext, table, key, value,
|
||||
Convert<Smi>(Convert<int32>(propertyDetails)));
|
||||
Add<GroupSse2Loader>(table, key, value, propertyDetails)
|
||||
otherwise Bailout;
|
||||
}
|
||||
|
||||
@export
|
||||
|
Loading…
Reference in New Issue
Block a user