[dict-proto] SIMD support for SwissNameDictionary in Torque
This CL adds a Torque-counterpart for swiss_table::GroupSse2Impl in Torque. This allows the Torque version of SwissNameDictionary to use SSE for lookups, rather than needing to bailout to the runtime on x64/ia32. Bug: v8:11330 Change-Id: I74e3f97c460a8b89031016967ec0e545265016a9 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2787485 Reviewed-by: Igor Sheludko <ishell@chromium.org> Reviewed-by: Santiago Aboy Solanes <solanes@chromium.org> Reviewed-by: Zhi An Ng <zhin@chromium.org> Commit-Queue: Igor Sheludko <ishell@chromium.org> Cr-Commit-Position: refs/heads/master@{#73727}
This commit is contained in:
parent
eff32ae88b
commit
856e8577e3
@ -113,6 +113,9 @@ type bool generates 'TNode<BoolT>' constexpr 'bool';
|
||||
type bint generates 'TNode<BInt>' constexpr 'BInt';
|
||||
type string constexpr 'const char*';
|
||||
|
||||
type Simd128 generates 'TNode<Simd128T>';
|
||||
type I8X16 extends Simd128 generates 'TNode<I8x16T>';
|
||||
|
||||
// Represents a std::function which produces the generated TNode type of T.
|
||||
// Useful for passing values to and from CSA code that uses LazyNode<T>, which
|
||||
// is a typedef for std::function<TNode<T>()>. Can be created with %MakeLazy and
|
||||
@ -917,7 +920,7 @@ extern operator '*' macro ConstexprInt31Mul(
|
||||
extern operator '-' macro Int32Sub(int16, int16): int32;
|
||||
extern operator '-' macro Int32Sub(uint16, uint16): int32;
|
||||
extern operator '-' macro Int32Sub(int32, int32): int32;
|
||||
extern operator '-' macro UInt32Sub(uint32, uint32): uint32;
|
||||
extern operator '-' macro Uint32Sub(uint32, uint32): uint32;
|
||||
extern operator '*' macro Int32Mul(int32, int32): int32;
|
||||
extern operator '*' macro Uint32Mul(uint32, uint32): uint32;
|
||||
extern operator '/' macro Int32Div(int32, int32): int32;
|
||||
@ -1050,6 +1053,7 @@ operator '==' macro PromiseStateEquals(
|
||||
}
|
||||
|
||||
extern macro CountLeadingZeros64(uint64): int64;
|
||||
extern macro CountTrailingZeros32(uint32): int32;
|
||||
extern macro CountTrailingZeros64(uint64): int64;
|
||||
|
||||
extern macro TaggedIsSmi(Object): bool;
|
||||
@ -1845,3 +1849,8 @@ extern operator '[]' macro LoadWeakFixedArrayElement(
|
||||
const kNoHashSentinel:
|
||||
constexpr int32 generates 'PropertyArray::kNoHashSentinel';
|
||||
extern macro LoadNameHash(Name): uint32;
|
||||
|
||||
extern macro LoadSimd128(intptr): Simd128;
|
||||
extern macro I8x16BitMask(I8X16): int32;
|
||||
extern macro I8x16Eq(I8X16, I8X16): I8X16;
|
||||
extern macro I8x16Splat(int32): I8X16;
|
||||
|
@ -335,3 +335,6 @@ Convert<PromiseState, int32>(s: int32): PromiseState {
|
||||
Convert<ScopeFlags, Smi>(s: Smi): ScopeFlags {
|
||||
return %RawDownCast<ScopeFlags>(Unsigned(SmiToInt32(s)));
|
||||
}
|
||||
Convert<I8X16, Simd128>(s: Simd128): I8X16 {
|
||||
return %RawDownCast<I8X16>(s);
|
||||
}
|
||||
|
@ -1137,6 +1137,10 @@ class V8_EXPORT_PRIVATE CodeStubAssembler
|
||||
Map::kConstructorOrBackPointerOrNativeContextOffset);
|
||||
}
|
||||
|
||||
TNode<Simd128T> LoadSimd128(TNode<IntPtrT> ptr) {
|
||||
return Load<Simd128T>(ptr);
|
||||
}
|
||||
|
||||
// Reference is the CSA-equivalent of a Torque reference value, representing
|
||||
// an inner pointer into a HeapObject.
|
||||
//
|
||||
|
@ -110,6 +110,16 @@ struct BoolT : Word32T {};
|
||||
template <class T1, class T2>
|
||||
struct PairT {};
|
||||
|
||||
struct Simd128T : UntaggedT {
|
||||
static const MachineRepresentation kMachineRepresentation =
|
||||
MachineRepresentation::kSimd128;
|
||||
static constexpr MachineType kMachineType = MachineType::Simd128();
|
||||
};
|
||||
|
||||
struct I8x16T : Simd128T {};
|
||||
struct I16x8T : Simd128T {};
|
||||
struct I32x2T : Simd128T {};
|
||||
|
||||
inline constexpr MachineType CommonMachineType(MachineType type1,
|
||||
MachineType type2) {
|
||||
return (type1 == type2) ? type1
|
||||
|
@ -270,6 +270,7 @@ class CodeAssemblerParameterizedLabel;
|
||||
V(Float64Min, Float64T, Float64T, Float64T) \
|
||||
V(Float64InsertLowWord32, Float64T, Float64T, Word32T) \
|
||||
V(Float64InsertHighWord32, Float64T, Float64T, Word32T) \
|
||||
V(I8x16Eq, I8x16T, I8x16T, I8x16T) \
|
||||
V(IntPtrAdd, WordT, WordT, WordT) \
|
||||
V(IntPtrSub, WordT, WordT, WordT) \
|
||||
V(IntPtrMul, WordT, WordT, WordT) \
|
||||
@ -374,6 +375,8 @@ TNode<Float64T> Float64Add(TNode<Float64T> a, TNode<Float64T> b);
|
||||
V(Word32BitwiseNot, Word32T, Word32T) \
|
||||
V(WordNot, WordT, WordT) \
|
||||
V(Word64Not, Word64T, Word64T) \
|
||||
V(I8x16BitMask, Int32T, I8x16T) \
|
||||
V(I8x16Splat, I8x16T, Int32T) \
|
||||
V(Int32AbsWithOverflow, PAIR_TYPE(Int32T, BoolT), Int32T) \
|
||||
V(Int64AbsWithOverflow, PAIR_TYPE(Int64T, BoolT), Int64T) \
|
||||
V(IntPtrAbsWithOverflow, PAIR_TYPE(IntPtrT, BoolT), IntPtrT) \
|
||||
|
@ -272,6 +272,7 @@ class MachineRepresentationInferrer {
|
||||
case IrOpcode::kFloat64ExtractLowWord32:
|
||||
case IrOpcode::kFloat64ExtractHighWord32:
|
||||
case IrOpcode::kWord32Popcnt:
|
||||
case IrOpcode::kI8x16BitMask:
|
||||
MACHINE_UNOP_32_LIST(LABEL)
|
||||
MACHINE_BINOP_32_LIST(LABEL) {
|
||||
representation_vector_[node->id()] =
|
||||
@ -323,6 +324,8 @@ class MachineRepresentationInferrer {
|
||||
break;
|
||||
case IrOpcode::kI32x4ReplaceLane:
|
||||
case IrOpcode::kI32x4Splat:
|
||||
case IrOpcode::kI8x16Splat:
|
||||
case IrOpcode::kI8x16Eq:
|
||||
representation_vector_[node->id()] =
|
||||
MachineRepresentation::kSimd128;
|
||||
break;
|
||||
@ -445,6 +448,7 @@ class MachineRepresentationChecker {
|
||||
case IrOpcode::kI32x4ExtractLane:
|
||||
case IrOpcode::kI16x8ExtractLaneU:
|
||||
case IrOpcode::kI16x8ExtractLaneS:
|
||||
case IrOpcode::kI8x16BitMask:
|
||||
case IrOpcode::kI8x16ExtractLaneU:
|
||||
case IrOpcode::kI8x16ExtractLaneS:
|
||||
CheckValueInputRepresentationIs(node, 0,
|
||||
@ -456,8 +460,16 @@ class MachineRepresentationChecker {
|
||||
CheckValueInputForInt32Op(node, 1);
|
||||
break;
|
||||
case IrOpcode::kI32x4Splat:
|
||||
case IrOpcode::kI8x16Splat:
|
||||
CheckValueInputForInt32Op(node, 0);
|
||||
break;
|
||||
case IrOpcode::kI8x16Eq:
|
||||
CheckValueInputRepresentationIs(node, 0,
|
||||
MachineRepresentation::kSimd128);
|
||||
CheckValueInputRepresentationIs(node, 1,
|
||||
MachineRepresentation::kSimd128);
|
||||
break;
|
||||
|
||||
#define LABEL(opcode) case IrOpcode::k##opcode:
|
||||
case IrOpcode::kChangeInt32ToTagged:
|
||||
case IrOpcode::kChangeUint32ToTagged:
|
||||
|
@ -881,6 +881,12 @@ class V8_EXPORT_PRIVATE RawMachineAssembler {
|
||||
Node* I16x8Splat(Node* a) { return AddNode(machine()->I16x8Splat(), a); }
|
||||
Node* I8x16Splat(Node* a) { return AddNode(machine()->I8x16Splat(), a); }
|
||||
|
||||
Node* I8x16BitMask(Node* a) { return AddNode(machine()->I8x16BitMask(), a); }
|
||||
|
||||
Node* I8x16Eq(Node* a, Node* b) {
|
||||
return AddNode(machine()->I8x16Eq(), a, b);
|
||||
}
|
||||
|
||||
// Stack operations.
|
||||
Node* LoadFramePointer() { return AddNode(machine()->LoadFramePointer()); }
|
||||
Node* LoadParentFramePointer() {
|
||||
|
@ -349,11 +349,29 @@ struct GroupPortableImpl {
|
||||
};
|
||||
|
||||
// Determine which Group implementation SwissNameDictionary uses.
|
||||
#if defined(V8_ENABLE_SWISS_NAME_DICTIONARY) && DEBUG
|
||||
// TODO(v8:11388) If v8_enable_swiss_name_dictionary is enabled, we are supposed
|
||||
// to use SwissNameDictionary as the dictionary backing store. If we want to use
|
||||
// the SIMD version of SwissNameDictionary, that would require us to compile SSE
|
||||
// instructions into the snapshot that exceed the minimum requirements for V8
|
||||
// SSE support. Therefore, this fails a DCHECK. However, given the experimental
|
||||
// nature of v8_enable_swiss_name_dictionary mode, we only expect this to be run
|
||||
// by developers/bots, that always have the necessary instructions. This means
|
||||
// that if v8_enable_swiss_name_dictionary is enabled and debug mode isn't, we
|
||||
// ignore the DCHECK that would fail in debug mode. However, if both
|
||||
// v8_enable_swiss_name_dictionary and debug mode are enabled, we must fallback
|
||||
// to the non-SSE implementation. Given that V8 requires SSE2, there should be a
|
||||
// solution that doesn't require the workaround present here. Instead, the
|
||||
// backend should only use SSE2 when compiling the SIMD version of
|
||||
// SwissNameDictionary into the builtin.
|
||||
using Group = GroupPortableImpl;
|
||||
#else
|
||||
#if SWISS_TABLE_HAVE_SSE2
|
||||
using Group = GroupSse2Impl;
|
||||
#else
|
||||
using Group = GroupPortableImpl;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#undef SWISS_TABLE_HAVE_SSE2
|
||||
#undef SWISS_TABLE_HAVE_SSE3
|
||||
|
@ -45,6 +45,10 @@ struct ProbeSequence {
|
||||
index: uint32;
|
||||
}
|
||||
|
||||
macro ClearLowestSetBit<T: type>(value: T): T {
|
||||
return value & (value - FromConstexpr<T>(1));
|
||||
}
|
||||
|
||||
const kByteMaskShift: uint64 = 3;
|
||||
|
||||
// Counterpart to swiss_table::BitMask<uint64_t, kWidth, 3>, as used by
|
||||
@ -61,12 +65,31 @@ struct ByteMask {
|
||||
|
||||
// Counterpart to operator++() in C++ version.
|
||||
macro ClearLowestSetBit() {
|
||||
this.mask = this.mask & (this.mask - FromConstexpr<uint64>(1));
|
||||
this.mask = ClearLowestSetBit<uint64>(this.mask);
|
||||
}
|
||||
|
||||
mask: uint64;
|
||||
}
|
||||
|
||||
// Counterpart to swiss_table::BitMask<uint32_t, kWidth, 0>, as used by
|
||||
// swiss_table::GroupSse2Impl in C++ implementation.
|
||||
struct BitMask {
|
||||
macro HasBitsSet(): bool {
|
||||
return this.mask != FromConstexpr<uint32>(0);
|
||||
}
|
||||
|
||||
macro LowestBitSet(): int32 {
|
||||
return Convert<int32>(CountTrailingZeros32(this.mask));
|
||||
}
|
||||
|
||||
// Counterpart to operator++() in C++ version.
|
||||
macro ClearLowestSetBit() {
|
||||
this.mask = ClearLowestSetBit<uint32>(this.mask);
|
||||
}
|
||||
|
||||
mask: uint32;
|
||||
}
|
||||
|
||||
macro H1(hash: uint32): uint32 {
|
||||
return hash >>> Unsigned(FromConstexpr<int32>(kH2Bits));
|
||||
}
|
||||
@ -80,6 +103,7 @@ const kLsbs: constexpr uint64
|
||||
const kMsbs: constexpr uint64
|
||||
generates 'swiss_table::GroupPortableImpl::kMsbs';
|
||||
|
||||
// Counterpart to swiss_table::GroupPortableImpl in C++.
|
||||
struct GroupPortableImpl {
|
||||
macro Match(h2: uint32): ByteMask {
|
||||
const x = Word64Xor(this.ctrl, (kLsbs * Convert<uint64>(h2)));
|
||||
@ -95,6 +119,45 @@ struct GroupPortableImpl {
|
||||
const ctrl: uint64;
|
||||
}
|
||||
|
||||
// Counterpart to swiss_table::GroupSse2Impl in C++. Note that the name is
|
||||
// chosen for consistency, this struct is not actually SSE-specific.
|
||||
struct GroupSse2Impl {
|
||||
macro Match(h2: uint32): BitMask {
|
||||
// Fill 16 8-bit lanes with |h2|:
|
||||
const searchPattern = I8x16Splat(Signed(h2));
|
||||
// Create a 128 bit mask such that in each of the 16 8-bit lanes, the MSB
|
||||
// indicates whether or not the corresponding lanes of |this.ctrl| and
|
||||
// |searchPattern| have the same value:
|
||||
const matches128 = I8x16Eq(searchPattern, this.ctrl);
|
||||
// Turn the 128 bit mask into a 32 bit one, by turning the MSB of the i-th
|
||||
// lane into the i-th bit in the output mask:
|
||||
const matches32 = Unsigned(I8x16BitMask(matches128));
|
||||
return BitMask{mask: matches32};
|
||||
}
|
||||
|
||||
macro MatchEmpty(): BitMask {
|
||||
// TODO(v8:11330) The C++ implementation in
|
||||
// swiss_table::GroupSse2Impl::MatchEmpty utilizes a special trick that is
|
||||
// possible due to kEmpty being -128 and allows shaving off one SSE
|
||||
// instruction. This depends on having access to _mm_cmpeq_epi8 aka PCMPEQB,
|
||||
// which the V8 backend currently doesn't expose.
|
||||
|
||||
// Fill 16 8-bit lanes with |kEmpty|:
|
||||
const searchPattern =
|
||||
I8x16Splat(Convert<int32>(FromConstexpr<uint8>(ctrl::kEmpty)));
|
||||
// Create a 128 bit mask such that in each of the 16 8-bit lanes, the MSB
|
||||
// indicates whether or not the corresponding lanes of |this.ctrl| contains
|
||||
// |kEmpty|:
|
||||
const matches128 = I8x16Eq(searchPattern, this.ctrl);
|
||||
// Turn the 128 bit mask into a 32 bit one, by turning the MSB of the i-th
|
||||
// lane into the i-th bit in the output mask:
|
||||
const matches32 = Unsigned(I8x16BitMask(matches128));
|
||||
return BitMask{mask: matches32};
|
||||
}
|
||||
|
||||
const ctrl: I8X16;
|
||||
}
|
||||
|
||||
struct GroupPortableLoader {
|
||||
macro LoadGroup(ctrlPtr: intptr): GroupPortableImpl {
|
||||
return GroupPortableImpl{
|
||||
@ -102,4 +165,10 @@ struct GroupPortableLoader {
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
struct GroupSse2Loader {
|
||||
macro LoadGroup(ctrlPtr: intptr): GroupSse2Impl {
|
||||
return GroupSse2Impl{ctrl: Convert<I8X16>(LoadSimd128(ctrlPtr))};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -28,12 +28,6 @@ const kMax2ByteMetaTableCapacity: constexpr int32
|
||||
const kNotFoundSentinel:
|
||||
constexpr int32 generates 'SwissNameDictionary::kNotFoundSentinel';
|
||||
|
||||
extern macro LoadSwissNameDictionaryNumberOfElements(
|
||||
SwissNameDictionary, intptr): intptr;
|
||||
|
||||
extern macro LoadSwissNameDictionaryNumberOfDeletedElements(
|
||||
SwissNameDictionary, intptr): intptr;
|
||||
|
||||
extern macro LoadSwissNameDictionaryKey(SwissNameDictionary, intptr): Name;
|
||||
|
||||
extern macro StoreSwissNameDictionaryKeyAndValue(
|
||||
@ -287,14 +281,8 @@ macro SwissNameDictionaryDelete(table: SwissNameDictionary, entry: intptr)
|
||||
@export
|
||||
macro SwissNameDictionaryFindEntrySIMD(table: SwissNameDictionary, key: Name):
|
||||
never labels Found(intptr), NotFound {
|
||||
// TODO(v8:11330) Not implemented in Torque, yet, doing runtime call
|
||||
// instead.
|
||||
const res = runtime::SwissTableFindEntry(kNoContext, table, key);
|
||||
if (res == kNotFoundSentinel) {
|
||||
goto NotFound;
|
||||
} else {
|
||||
goto Found(Convert<intptr>(res));
|
||||
}
|
||||
FindEntry<GroupSse2Loader>(table, key)
|
||||
otherwise Found, NotFound;
|
||||
}
|
||||
|
||||
@export
|
||||
@ -317,26 +305,8 @@ Found(intptr),
|
||||
macro SwissNameDictionaryAddSIMD(
|
||||
table: SwissNameDictionary, key: Name, value: Object,
|
||||
propertyDetails: uint8) labels Bailout {
|
||||
// TODO(v8:11330) Not implemented in Torque, yet, doing runtime call
|
||||
// instead. However, must bailout if the runtime call would allocate a new
|
||||
// dictionary.
|
||||
|
||||
// Determine if bailout needed:
|
||||
const capacity = Convert<intptr>(table.capacity);
|
||||
const maxUsable = SwissNameDictionaryMaxUsableCapacity(capacity);
|
||||
// Doing two independent accesses to the meta table here (repeating the
|
||||
// branching), rather than combining the accesses. Accepting that due to
|
||||
// the fact that this is a slow placeholder until the SIMD version
|
||||
// replaces it.
|
||||
const nof = LoadSwissNameDictionaryNumberOfElements(table, capacity);
|
||||
const nod = LoadSwissNameDictionaryNumberOfDeletedElements(table, capacity);
|
||||
const used = nof + nod;
|
||||
if (used >= maxUsable) {
|
||||
goto Bailout;
|
||||
}
|
||||
runtime::SwissTableAdd(
|
||||
kNoContext, table, key, value,
|
||||
Convert<Smi>(Convert<int32>(propertyDetails)));
|
||||
Add<GroupSse2Loader>(table, key, value, propertyDetails)
|
||||
otherwise Bailout;
|
||||
}
|
||||
|
||||
@export
|
||||
|
Loading…
Reference in New Issue
Block a user