[turbofan] Clean up notion of a Code Point in operators and builtins

Various TurboFan operators and builtins currently conflate the concept of a Unicode code point with that of its UTF-32 and UTF-16 encoding. UTF-16 is only used as an internal optimization, and should not be exposed. This CL separates affected operators and clarifies the naming of various internals.

Prior to this CL, the StringCodePointAt operator's typing rule was unsound, since the operator was not strictly limited to returning values in the range of Unicode code points.

R=sigurds@chromium.org

Bug: v8:9413
Change-Id: Icd32fdbeceadbc74e9761b717a27e3ff9ecde1f2
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1683998
Commit-Queue: Sigurd Schneider <sigurds@chromium.org>
Reviewed-by: Benedikt Meurer <bmeurer@chromium.org>
Reviewed-by: Sigurd Schneider <sigurds@chromium.org>
Cr-Commit-Position: refs/heads/master@{#62490}
This commit is contained in:
Sigurd Schneider 2019-07-02 15:37:44 +02:00 committed by Commit Bot
parent 5fe29649cf
commit a7eb133156
16 changed files with 109 additions and 146 deletions

View File

@ -103,8 +103,8 @@ namespace internal {
\
/* String helpers */ \
TFC(StringCharAt, StringAt) \
TFC(StringCodePointAtUTF16, StringAt) \
TFC(StringCodePointAtUTF32, StringAt) \
TFC(StringCodePointAt, StringAt) \
TFC(StringFromCodePointAt, StringAtAsString) \
TFC(StringEqual, Compare) \
TFC(StringGreaterThan, Compare) \
TFC(StringGreaterThanOrEqual, Compare) \

View File

@ -545,21 +545,7 @@ TF_BUILTIN(StringCharAt, StringBuiltinsAssembler) {
Return(result);
}
TF_BUILTIN(StringCodePointAtUTF16, StringBuiltinsAssembler) {
Node* receiver = Parameter(Descriptor::kReceiver);
Node* position = Parameter(Descriptor::kPosition);
// TODO(sigurds) Figure out if passing length as argument pays off.
TNode<IntPtrT> length = LoadStringLengthAsWord(receiver);
// Load the character code at the {position} from the {receiver}.
TNode<Int32T> code =
LoadSurrogatePairAt(receiver, length, position, UnicodeEncoding::UTF16);
// And return it as TaggedSigned value.
// TODO(turbofan): Allow builtins to return values untagged.
TNode<Smi> result = SmiFromInt32(code);
Return(result);
}
TF_BUILTIN(StringCodePointAtUTF32, StringBuiltinsAssembler) {
TF_BUILTIN(StringCodePointAt, StringBuiltinsAssembler) {
Node* receiver = Parameter(Descriptor::kReceiver);
Node* position = Parameter(Descriptor::kPosition);
@ -574,6 +560,21 @@ TF_BUILTIN(StringCodePointAtUTF32, StringBuiltinsAssembler) {
Return(result);
}
// Builtin that produces a String for the code point found at {position} in
// {receiver}. The value is read with LoadSurrogatePairAt using
// UnicodeEncoding::UTF16 and is handed unchanged to
// StringFromSingleUTF16EncodedCodePoint, so the UTF-16 encoded form never
// leaves this builtin.
TF_BUILTIN(StringFromCodePointAt, StringBuiltinsAssembler) {
TNode<String> receiver = CAST(Parameter(Descriptor::kReceiver));
// {position} is used as a raw IntPtrT via UncheckedCast.
// NOTE(review): presumably the caller guarantees an in-bounds word index;
// no bounds check is visible in this builtin -- confirm.
TNode<IntPtrT> position =
UncheckedCast<IntPtrT>(Parameter(Descriptor::kPosition));
// TODO(sigurds) Figure out if passing length as argument pays off.
TNode<IntPtrT> length = LoadStringLengthAsWord(receiver);
// Load the character code at the {position} from the {receiver}.
TNode<Int32T> code =
LoadSurrogatePairAt(receiver, length, position, UnicodeEncoding::UTF16);
// Create a String from the UTF16 encoded code point.
TNode<String> result = StringFromSingleUTF16EncodedCodePoint(code);
Return(result);
}
// -----------------------------------------------------------------------------
// ES6 section 21.1 String Objects

View File

@ -39,7 +39,7 @@ namespace string_iterator {
// Move to next codepoint.
const encoding = UTF16;
const ch = string::LoadSurrogatePairAt(string, length, position, encoding);
const value: String = string::StringFromSingleCodePoint(ch, encoding);
const value: String = string::StringFromSingleUTF16EncodedCodePoint(ch);
iterator.next_index = SmiTag(position + value.length_intptr);
return AllocateJSIteratorResult(value, False);
}

View File

@ -21,8 +21,7 @@ namespace string {
extern macro StringBuiltinsAssembler::LoadSurrogatePairAt(
String, intptr, intptr, constexpr UnicodeEncoding): int32;
extern macro StringFromSingleCodePoint(int32, constexpr UnicodeEncoding):
String;
extern macro StringFromSingleUTF16EncodedCodePoint(int32): String;
// This function assumes StringPrimitiveWithNoCustomIteration is true.
transitioning builtin StringToList(implicit context: Context)(string: String):
@ -40,7 +39,7 @@ namespace string {
let i: intptr = 0;
while (i < stringLength) {
const ch: int32 = LoadSurrogatePairAt(string, stringLength, i, encoding);
const value: String = StringFromSingleCodePoint(ch, encoding);
const value: String = StringFromSingleUTF16EncodedCodePoint(ch);
elements[arrayLength] = value;
// Increment and continue the loop.
i = i + value.length_intptr;
@ -54,9 +53,9 @@ namespace string {
}
transitioning macro GenerateStringAt(implicit context: Context)(
receiver: Object, position: Object, methodName: constexpr string):
never labels IfInBounds(String, intptr, intptr),
IfOutOfBounds {
receiver: Object, position: Object,
methodName: constexpr string): never labels
IfInBounds(String, intptr, intptr), IfOutOfBounds {
// Check that {receiver} is coercible to Object and convert it to a String.
const string: String = ToThisString(receiver, methodName);
// Convert the {position} to a Smi and check that it's in bounds of

View File

@ -7481,8 +7481,8 @@ TNode<String> CodeStubAssembler::StringAdd(Node* context, TNode<String> left,
return result.value();
}
TNode<String> CodeStubAssembler::StringFromSingleCodePoint(
TNode<Int32T> codepoint, UnicodeEncoding encoding) {
TNode<String> CodeStubAssembler::StringFromSingleUTF16EncodedCodePoint(
TNode<Int32T> codepoint) {
VARIABLE(var_result, MachineRepresentation::kTagged, EmptyStringConstant());
Label if_isword16(this), if_isword32(this), return_result(this);
@ -7498,27 +7498,6 @@ TNode<String> CodeStubAssembler::StringFromSingleCodePoint(
BIND(&if_isword32);
{
switch (encoding) {
case UnicodeEncoding::UTF16:
break;
case UnicodeEncoding::UTF32: {
// Convert UTF32 to UTF16 code units, and store as a 32 bit word.
Node* lead_offset = Int32Constant(0xD800 - (0x10000 >> 10));
// lead = (codepoint >> 10) + LEAD_OFFSET
Node* lead =
Int32Add(Word32Shr(codepoint, Int32Constant(10)), lead_offset);
// trail = (codepoint & 0x3FF) + 0xDC00;
Node* trail = Int32Add(Word32And(codepoint, Int32Constant(0x3FF)),
Int32Constant(0xDC00));
// codepoint = (trail << 16) | lead;
codepoint = Signed(Word32Or(Word32Shl(trail, Int32Constant(16)), lead));
break;
}
}
Node* value = AllocateSeqTwoByteString(2);
StoreNoWriteBarrier(
MachineRepresentation::kWord32, value,

View File

@ -2462,8 +2462,7 @@ class V8_EXPORT_PRIVATE CodeStubAssembler
Node* DerefIndirectString(TNode<String> string, TNode<Int32T> instance_type,
Label* cannot_deref);
TNode<String> StringFromSingleCodePoint(TNode<Int32T> codepoint,
UnicodeEncoding encoding);
TNode<String> StringFromSingleUTF16EncodedCodePoint(TNode<Int32T> codepoint);
// Type conversion helpers.
enum class BigIntHandling { kConvertToNumber, kThrow };

View File

@ -252,6 +252,11 @@ void StringAtDescriptor::InitializePlatformSpecific(
DefaultInitializePlatformSpecific(data, kParameterCount);
}
// Platform-specific setup for StringAtAsStringDescriptor: forwards {data} to
// the default initialization together with the descriptor's parameter count.
void StringAtAsStringDescriptor::InitializePlatformSpecific(
CallInterfaceDescriptorData* data) {
DefaultInitializePlatformSpecific(data, kParameterCount);
}
void StringSubstringDescriptor::InitializePlatformSpecific(
CallInterfaceDescriptorData* data) {
DefaultInitializePlatformSpecific(data, kParameterCount);

View File

@ -74,6 +74,7 @@ namespace internal {
V(StoreTransition) \
V(StoreWithVector) \
V(StringAt) \
V(StringAtAsString) \
V(StringSubstring) \
V(TypeConversion) \
V(TypeConversionStackParameter) \
@ -969,6 +970,17 @@ class StringAtDescriptor final : public CallInterfaceDescriptor {
DECLARE_DESCRIPTOR(StringAtDescriptor, CallInterfaceDescriptor)
};
// Call interface descriptor for string-at style builtins whose result is a
// string: takes a tagged receiver and a word-sized position, and returns a
// tagged pointer.
class StringAtAsStringDescriptor final : public CallInterfaceDescriptor {
public:
DEFINE_PARAMETERS(kReceiver, kPosition)
// TODO(turbofan): Return untagged value here.
// NOTE(review): the result declared below is a tagged string pointer; this
// TODO may have been carried over from a descriptor with a numeric result --
// confirm whether it still applies.
DEFINE_RESULT_AND_PARAMETER_TYPES(
MachineType::TaggedPointer(), // result string
MachineType::AnyTagged(), // kReceiver
MachineType::IntPtr()) // kPosition
DECLARE_DESCRIPTOR(StringAtAsStringDescriptor, CallInterfaceDescriptor)
};
class StringSubstringDescriptor final : public CallInterfaceDescriptor {
public:
DEFINE_PARAMETERS(kString, kFrom, kTo)

View File

@ -154,13 +154,14 @@ class EffectControlLinearizer {
Node* LowerStringConcat(Node* node);
Node* LowerStringToNumber(Node* node);
Node* LowerStringCharCodeAt(Node* node);
Node* LowerStringCodePointAt(Node* node, UnicodeEncoding encoding);
Node* LowerStringCodePointAt(Node* node);
Node* LowerStringToLowerCaseIntl(Node* node);
Node* LowerStringToUpperCaseIntl(Node* node);
Node* LowerStringFromSingleCharCode(Node* node);
Node* LowerStringFromSingleCodePoint(Node* node);
Node* LowerStringIndexOf(Node* node);
Node* LowerStringSubstring(Node* node);
Node* LowerStringFromCodePointAt(Node* node);
Node* LowerStringLength(Node* node);
Node* LowerStringEqual(Node* node);
Node* LowerStringLessThan(Node* node);
@ -1127,6 +1128,9 @@ bool EffectControlLinearizer::TryWireInStateEffect(Node* node,
case IrOpcode::kStringIndexOf:
result = LowerStringIndexOf(node);
break;
case IrOpcode::kStringFromCodePointAt:
result = LowerStringFromCodePointAt(node);
break;
case IrOpcode::kStringLength:
result = LowerStringLength(node);
break;
@ -1137,7 +1141,7 @@ bool EffectControlLinearizer::TryWireInStateEffect(Node* node,
result = LowerStringCharCodeAt(node);
break;
case IrOpcode::kStringCodePointAt:
result = LowerStringCodePointAt(node, UnicodeEncodingOf(node->op()));
result = LowerStringCodePointAt(node);
break;
case IrOpcode::kStringToLowerCaseIntl:
result = LowerStringToLowerCaseIntl(node);
@ -3851,16 +3855,12 @@ Node* EffectControlLinearizer::LowerStringCharCodeAt(Node* node) {
return loop_done.PhiAt(0);
}
Node* EffectControlLinearizer::LowerStringCodePointAt(
Node* node, UnicodeEncoding encoding) {
Node* EffectControlLinearizer::LowerStringCodePointAt(Node* node) {
Node* receiver = node->InputAt(0);
Node* position = node->InputAt(1);
Builtins::Name builtin = encoding == UnicodeEncoding::UTF16
? Builtins::kStringCodePointAtUTF16
: Builtins::kStringCodePointAtUTF32;
Callable const callable = Builtins::CallableFor(isolate(), builtin);
Callable const callable =
Builtins::CallableFor(isolate(), Builtins::kStringCodePointAt);
Operator::Properties properties = Operator::kNoThrow | Operator::kNoWrite;
CallDescriptor::Flags flags = CallDescriptor::kNoFlags;
auto call_descriptor = Linkage::GetStubCallDescriptor(
@ -4093,31 +4093,23 @@ Node* EffectControlLinearizer::LowerStringFromSingleCodePoint(Node* node) {
__ Bind(&if_not_single_code);
// Generate surrogate pair string
{
switch (UnicodeEncodingOf(node->op())) {
case UnicodeEncoding::UTF16:
break;
// Convert UTF32 to UTF16 code units, and store as a 32 bit word.
Node* lead_offset = __ Int32Constant(0xD800 - (0x10000 >> 10));
case UnicodeEncoding::UTF32: {
// Convert UTF32 to UTF16 code units, and store as a 32 bit word.
Node* lead_offset = __ Int32Constant(0xD800 - (0x10000 >> 10));
// lead = (codepoint >> 10) + LEAD_OFFSET
Node* lead =
__ Int32Add(__ Word32Shr(code, __ Int32Constant(10)), lead_offset);
// lead = (codepoint >> 10) + LEAD_OFFSET
Node* lead =
__ Int32Add(__ Word32Shr(code, __ Int32Constant(10)), lead_offset);
// trail = (codepoint & 0x3FF) + 0xDC00;
Node* trail = __ Int32Add(__ Word32And(code, __ Int32Constant(0x3FF)),
__ Int32Constant(0xDC00));
// trail = (codepoint & 0x3FF) + 0xDC00;
Node* trail = __ Int32Add(__ Word32And(code, __ Int32Constant(0x3FF)),
__ Int32Constant(0xDC00));
// codepoint = (trail << 16) | lead;
// codepoint = (trail << 16) | lead;
#if V8_TARGET_BIG_ENDIAN
code = __ Word32Or(__ Word32Shl(lead, __ Int32Constant(16)), trail);
code = __ Word32Or(__ Word32Shl(lead, __ Int32Constant(16)), trail);
#else
code = __ Word32Or(__ Word32Shl(trail, __ Int32Constant(16)), lead);
code = __ Word32Or(__ Word32Shl(trail, __ Int32Constant(16)), lead);
#endif
break;
}
}
// Allocate a new SeqTwoByteString for {code}.
Node* vfalse0 =
@ -4157,6 +4149,21 @@ Node* EffectControlLinearizer::LowerStringIndexOf(Node* node) {
search_string, position, __ NoContextConstant());
}
// Lowers a StringFromCodePointAt node into a call to the
// StringFromCodePointAt builtin and returns the call node.
Node* EffectControlLinearizer::LowerStringFromCodePointAt(Node* node) {
// Value inputs of the node: the string (0) and the index into it (1).
Node* string = node->InputAt(0);
Node* index = node->InputAt(1);
Callable callable =
Builtins::CallableFor(isolate(), Builtins::kStringFromCodePointAt);
// The stub call is described with Operator::kEliminatable and no call flags.
Operator::Properties properties = Operator::kEliminatable;
CallDescriptor::Flags flags = CallDescriptor::kNoFlags;
auto call_descriptor = Linkage::GetStubCallDescriptor(
graph()->zone(), callable.descriptor(),
callable.descriptor().GetStackParameterCount(), flags, properties);
// Arguments: builtin code object, then string and index; NoContextConstant()
// is passed in the context slot.
return __ Call(call_descriptor, __ HeapConstant(callable.code()), string,
index, __ NoContextConstant());
}
Node* EffectControlLinearizer::LowerStringLength(Node* node) {
Node* subject = node->InputAt(0);

View File

@ -3682,8 +3682,8 @@ Reduction JSCallReducer::ReduceJSCall(Node* node,
return ReduceStringPrototypeStringAt(simplified()->StringCharCodeAt(),
node);
case Builtins::kStringPrototypeCodePointAt:
return ReduceStringPrototypeStringAt(
simplified()->StringCodePointAt(UnicodeEncoding::UTF32), node);
return ReduceStringPrototypeStringAt(simplified()->StringCodePointAt(),
node);
case Builtins::kStringPrototypeSubstring:
return ReduceStringPrototypeSubstring(node);
case Builtins::kStringPrototypeSlice:
@ -5514,8 +5514,8 @@ Reduction JSCallReducer::ReduceStringFromCodePoint(Node* node) {
graph()->NewNode(simplified()->CheckBounds(p.feedback()), input,
jsgraph()->Constant(0x10FFFF + 1), effect, control);
Node* value = graph()->NewNode(
simplified()->StringFromSingleCodePoint(UnicodeEncoding::UTF32), input);
Node* value =
graph()->NewNode(simplified()->StringFromSingleCodePoint(), input);
ReplaceWithValue(node, value, effect);
return Replace(value);
}
@ -5571,12 +5571,8 @@ Reduction JSCallReducer::ReduceStringIteratorPrototypeNext(Node* node) {
Node* vtrue0;
{
done_true = jsgraph()->FalseConstant();
Node* codepoint = etrue0 = graph()->NewNode(
simplified()->StringCodePointAt(UnicodeEncoding::UTF16), string, index,
etrue0, if_true0);
vtrue0 = graph()->NewNode(
simplified()->StringFromSingleCodePoint(UnicodeEncoding::UTF16),
codepoint);
vtrue0 = etrue0 = graph()->NewNode(simplified()->StringFromCodePointAt(),
string, index, etrue0, if_true0);
// Update iterator.[[NextIndex]]
Node* char_length = graph()->NewNode(simplified()->StringLength(), vtrue0);

View File

@ -389,6 +389,7 @@
V(StringCodePointAt) \
V(StringFromSingleCharCode) \
V(StringFromSingleCodePoint) \
V(StringFromCodePointAt) \
V(StringIndexOf) \
V(StringLength) \
V(StringToLowerCaseIntl) \

View File

@ -2683,6 +2683,10 @@ class RepresentationSelector {
MachineRepresentation::kTaggedPointer);
return;
}
case IrOpcode::kStringFromCodePointAt: {
return VisitBinop(node, UseInfo::AnyTagged(), UseInfo::Word(),
MachineRepresentation::kTaggedPointer);
}
case IrOpcode::kStringIndexOf: {
ProcessInput(node, 0, UseInfo::AnyTagged());
ProcessInput(node, 1, UseInfo::AnyTagged());

View File

@ -616,12 +616,6 @@ Type AllocateTypeOf(const Operator* op) {
return AllocateParametersOf(op).type();
}
UnicodeEncoding UnicodeEncodingOf(const Operator* op) {
DCHECK(op->opcode() == IrOpcode::kStringFromSingleCodePoint ||
op->opcode() == IrOpcode::kStringCodePointAt);
return OpParameter<UnicodeEncoding>(op);
}
AbortReason AbortReasonOf(const Operator* op) {
DCHECK_EQ(IrOpcode::kRuntimeAbort, op->opcode());
return static_cast<AbortReason>(OpParameter<int>(op));
@ -736,6 +730,7 @@ bool operator==(CheckMinusZeroParameters const& lhs,
V(StringConcat, Operator::kNoProperties, 3, 0) \
V(StringToNumber, Operator::kNoProperties, 1, 0) \
V(StringFromSingleCharCode, Operator::kNoProperties, 1, 0) \
V(StringFromSingleCodePoint, Operator::kNoProperties, 1, 0) \
V(StringIndexOf, Operator::kNoProperties, 3, 0) \
V(StringLength, Operator::kNoProperties, 1, 0) \
V(StringToLowerCaseIntl, Operator::kNoProperties, 1, 0) \
@ -802,9 +797,11 @@ bool operator==(CheckMinusZeroParameters const& lhs,
V(NewConsString, Operator::kNoProperties, 3, 0) \
V(PoisonIndex, Operator::kNoProperties, 1, 0)
#define EFFECT_DEPENDENT_OP_LIST(V) \
V(StringCharCodeAt, Operator::kNoProperties, 2, 1) \
V(StringSubstring, Operator::kNoProperties, 3, 1) \
#define EFFECT_DEPENDENT_OP_LIST(V) \
V(StringCharCodeAt, Operator::kNoProperties, 2, 1) \
V(StringCodePointAt, Operator::kNoProperties, 2, 1) \
V(StringFromCodePointAt, Operator::kNoProperties, 2, 1) \
V(StringSubstring, Operator::kNoProperties, 3, 1) \
V(DateNow, Operator::kNoProperties, 0, 1)
#define SPECULATIVE_NUMBER_BINOP_LIST(V) \
@ -929,32 +926,6 @@ struct SimplifiedOperatorGlobalCache final {
DEOPTIMIZE_REASON_LIST(CHECK_IF)
#undef CHECK_IF
template <UnicodeEncoding kEncoding>
struct StringCodePointAtOperator final : public Operator1<UnicodeEncoding> {
StringCodePointAtOperator()
: Operator1<UnicodeEncoding>(IrOpcode::kStringCodePointAt,
Operator::kFoldable | Operator::kNoThrow,
"StringCodePointAt", 2, 1, 1, 1, 1, 0,
kEncoding) {}
};
StringCodePointAtOperator<UnicodeEncoding::UTF16>
kStringCodePointAtOperatorUTF16;
StringCodePointAtOperator<UnicodeEncoding::UTF32>
kStringCodePointAtOperatorUTF32;
template <UnicodeEncoding kEncoding>
struct StringFromSingleCodePointOperator final
: public Operator1<UnicodeEncoding> {
StringFromSingleCodePointOperator()
: Operator1<UnicodeEncoding>(
IrOpcode::kStringFromSingleCodePoint, Operator::kPure,
"StringFromSingleCodePoint", 1, 0, 0, 1, 0, 0, kEncoding) {}
};
StringFromSingleCodePointOperator<UnicodeEncoding::UTF16>
kStringFromSingleCodePointOperatorUTF16;
StringFromSingleCodePointOperator<UnicodeEncoding::UTF32>
kStringFromSingleCodePointOperatorUTF32;
struct FindOrderedHashMapEntryOperator final : public Operator {
FindOrderedHashMapEntryOperator()
: Operator(IrOpcode::kFindOrderedHashMapEntry, Operator::kEliminatable,
@ -1704,28 +1675,6 @@ const Operator* SimplifiedOperatorBuilder::AllocateRaw(
AllocateParameters(type, allocation, allow_large_objects));
}
const Operator* SimplifiedOperatorBuilder::StringCodePointAt(
UnicodeEncoding encoding) {
switch (encoding) {
case UnicodeEncoding::UTF16:
return &cache_.kStringCodePointAtOperatorUTF16;
case UnicodeEncoding::UTF32:
return &cache_.kStringCodePointAtOperatorUTF32;
}
UNREACHABLE();
}
const Operator* SimplifiedOperatorBuilder::StringFromSingleCodePoint(
UnicodeEncoding encoding) {
switch (encoding) {
case UnicodeEncoding::UTF16:
return &cache_.kStringFromSingleCodePointOperatorUTF16;
case UnicodeEncoding::UTF32:
return &cache_.kStringFromSingleCodePointOperatorUTF32;
}
UNREACHABLE();
}
#define SPECULATIVE_NUMBER_BINOP(Name) \
const Operator* SimplifiedOperatorBuilder::Name(NumberOperationHint hint) { \
switch (hint) { \

View File

@ -697,9 +697,10 @@ class V8_EXPORT_PRIVATE SimplifiedOperatorBuilder final
const Operator* StringLessThan();
const Operator* StringLessThanOrEqual();
const Operator* StringCharCodeAt();
const Operator* StringCodePointAt(UnicodeEncoding encoding);
const Operator* StringCodePointAt();
const Operator* StringFromSingleCharCode();
const Operator* StringFromSingleCodePoint(UnicodeEncoding encoding);
const Operator* StringFromSingleCodePoint();
const Operator* StringFromCodePointAt();
const Operator* StringIndexOf();
const Operator* StringLength();
const Operator* StringToLowerCaseIntl();

View File

@ -2068,6 +2068,10 @@ Type Typer::Visitor::TypeStringFromSingleCodePoint(Node* node) {
return TypeUnaryOp(node, StringFromSingleCodePointTyper);
}
// Typing rule for StringFromCodePointAt: the result is always typed as
// String; the inputs of {node} are not inspected.
Type Typer::Visitor::TypeStringFromCodePointAt(Node* node) {
return Type::String();
}
Type Typer::Visitor::TypeStringIndexOf(Node* node) {
return Type::Range(-1.0, String::kMaxLength, zone());
}

View File

@ -1164,6 +1164,12 @@ void Verifier::Visitor::Check(Node* node, const AllNodes& all) {
CheckValueInputIs(node, 0, Type::Number());
CheckTypeIs(node, Type::String());
break;
case IrOpcode::kStringFromCodePointAt:
// (String, Unsigned32) -> String
CheckValueInputIs(node, 0, Type::String());
CheckValueInputIs(node, 1, Type::Unsigned32());
CheckTypeIs(node, Type::String());
break;
case IrOpcode::kStringIndexOf:
// (String, String, SignedSmall) -> SignedSmall
CheckValueInputIs(node, 0, Type::String());