[stringrefs] Add wtf8_policy immediate to string.new_wtf8

Following change in https://github.com/WebAssembly/stringref/pull/22.
This adds two new parsing modes: a strict UTF-8 parsing mode, and a
sloppy mode that should replace invalid subsequences with U+FFFD.

Bug: v8:12868
Change-Id: I03bd8d2a3408c399ce68f7b150d7650908804113
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3719919
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Andy Wingo <wingo@igalia.com>
Cr-Commit-Position: refs/heads/main@{#81337}
This commit is contained in:
Andy Wingo 2022-06-23 15:17:34 +02:00 committed by V8 LUCI CQ
parent 118dff9dcd
commit cf8fc47445
17 changed files with 192 additions and 58 deletions

View File

@ -39,7 +39,7 @@ extern runtime WasmArrayCopy(
extern runtime WasmArrayInitFromSegment(
Context, WasmInstanceObject, Smi, Smi, Smi, Map): Object;
extern runtime WasmStringNewWtf8(
Context, WasmInstanceObject, Smi, Number, Number): String;
Context, WasmInstanceObject, Smi, Smi, Number, Number): String;
extern runtime WasmStringNewWtf16(
Context, WasmInstanceObject, Smi, Number, Number): String;
extern runtime WasmStringConst(Context, WasmInstanceObject, Smi): String;
@ -798,10 +798,10 @@ transitioning javascript builtin ExperimentalWasmConvertStringToArray(
}
builtin WasmStringNewWtf8(
memory: uint32, offset: uint32, size: uint32): String {
offset: uint32, size: uint32, memory: Smi, policy: Smi): String {
const instance = LoadInstanceFromFrame();
tail runtime::WasmStringNewWtf8(
LoadContextFromInstance(instance), instance, SmiFromUint32(memory),
LoadContextFromInstance(instance), instance, memory, policy,
WasmUint32ToNumber(offset), WasmUint32ToNumber(size));
}
builtin WasmStringNewWtf16(

View File

@ -646,6 +646,7 @@ namespace internal {
T(WasmTrapIllegalCast, "illegal cast") \
T(WasmTrapArrayOutOfBounds, "array element access out of bounds") \
T(WasmTrapArrayTooLarge, "requested new array is too large") \
T(WasmTrapStringInvalidUtf8, "invalid UTF-8 string") \
T(WasmTrapStringInvalidWtf8, "invalid WTF-8 string") \
T(WasmTrapStringOffsetOutOfBounds, "string offset out of bounds") \
T(WasmTrapStringIsolatedSurrogate, \

View File

@ -5755,10 +5755,12 @@ void WasmGraphBuilder::ArrayCopy(Node* dst_array, Node* dst_index,
gasm_->Bind(&skip);
}
Node* WasmGraphBuilder::StringNewWtf8(uint32_t memory, Node* offset,
Node* size) {
Node* WasmGraphBuilder::StringNewWtf8(uint32_t memory,
wasm::StringRefWtf8Policy policy,
Node* offset, Node* size) {
return gasm_->CallBuiltin(Builtin::kWasmStringNewWtf8, Operator::kNoDeopt,
gasm_->Uint32Constant(memory), offset, size);
offset, size, gasm_->SmiConstant(memory),
gasm_->SmiConstant(static_cast<int32_t>(policy)));
}
Node* WasmGraphBuilder::StringNewWtf16(uint32_t memory, Node* offset,

View File

@ -535,7 +535,8 @@ class WasmGraphBuilder {
void BrOnI31(Node* object, Node* rtt, WasmTypeCheckConfig config,
Node** match_control, Node** match_effect,
Node** no_match_control, Node** no_match_effect);
Node* StringNewWtf8(uint32_t memory, Node* offset, Node* size);
Node* StringNewWtf8(uint32_t memory, wasm::StringRefWtf8Policy policy,
Node* offset, Node* size);
Node* StringNewWtf16(uint32_t memory, Node* offset, Node* size);
Node* StringConst(uint32_t index);
Node* StringMeasureUtf8(Node* string, CheckForNull null_check,

View File

@ -768,6 +768,20 @@ MaybeHandle<String> Factory::NewStringFromWtf8(
return NewStringFromBytes<Wtf8Decoder>(isolate(), string, allocation,
handler);
}
// Creates a string from `string`, which must be strictly valid UTF-8. Where
// NewStringFromUtf8 substitutes U+FFFD for decoding errors, this variant
// traps: the error handler below throws a WebAssembly RuntimeError
// (kWasmTrapStringInvalidUtf8) tagged with wasm_uncatchable_symbol.
MaybeHandle<String> Factory::NewStringFromStrictUtf8(
    const base::Vector<const uint8_t>& string, AllocationType allocation) {
  auto handler = [&]() {
    Handle<JSObject> error_obj =
        NewWasmRuntimeError(MessageTemplate::kWasmTrapStringInvalidUtf8);
    // Tag the error object with the wasm-uncatchable symbol before throwing.
    JSObject::AddProperty(isolate(), error_obj, wasm_uncatchable_symbol(),
                          true_value(), NONE);
    isolate()->Throw(*error_obj);
  };
  // `string` already has type Vector<const uint8_t>, so it is forwarded
  // directly (as NewStringFromWtf8 does); a same-type Vector::cast is a no-op.
  return NewStringFromBytes<StrictUtf8Decoder>(isolate(), string, allocation,
                                               handler);
}
#endif // V8_ENABLE_WEBASSEMBLY
MaybeHandle<String> Factory::NewStringFromUtf8SubString(

View File

@ -265,6 +265,11 @@ class V8_EXPORT_PRIVATE Factory : public FactoryBase<Factory> {
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromWtf8(
const base::Vector<const uint8_t>& str,
AllocationType allocation = AllocationType::kYoung);
// The NewStringFromUtf8 function will replace any decoding error with U+FFFD
// (the replacement character). This function will trap instead.
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromStrictUtf8(
const base::Vector<const uint8_t>& str,
AllocationType allocation = AllocationType::kYoung);
#endif // V8_ENABLE_WEBASSEMBLY
V8_WARN_UNUSED_RESULT MaybeHandle<String> NewStringFromUtf8SubString(

View File

@ -861,15 +861,19 @@ RUNTIME_FUNCTION(Runtime_WasmCreateResumePromise) {
// exception and returns an empty result.
RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8) {
ClearThreadInWasmScope flag_scope(isolate);
DCHECK_EQ(4, args.length());
DCHECK_EQ(5, args.length());
HandleScope scope(isolate);
Handle<WasmInstanceObject> instance = args.at<WasmInstanceObject>(0);
uint32_t memory = args.positive_smi_value_at(1);
uint32_t offset = NumberToUint32(args[2]);
uint32_t size = NumberToUint32(args[3]);
uint32_t policy_value = args.positive_smi_value_at(2);
uint32_t offset = NumberToUint32(args[3]);
uint32_t size = NumberToUint32(args[4]);
DCHECK_EQ(memory, 0);
USE(memory);
DCHECK(policy_value <= wasm::kLastWtf8Policy);
auto policy = static_cast<wasm::StringRefWtf8Policy>(policy_value);
uint64_t mem_size = instance->memory_size();
if (!base::IsInBounds<uint64_t>(offset, size, mem_size)) {
@ -880,8 +884,22 @@ RUNTIME_FUNCTION(Runtime_WasmStringNewWtf8) {
size};
// TODO(12868): Override any exception with an uncatchable-by-wasm trap.
Handle<String> result;
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromWtf8(bytes));
switch (policy) {
case wasm::kWtf8PolicyReject:
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromStrictUtf8(bytes));
break;
case wasm::kWtf8PolicyAccept:
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromWtf8(bytes));
break;
case wasm::kWtf8PolicyReplace: {
auto string = base::Vector<const char>::cast(bytes);
ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
isolate, result, isolate->factory()->NewStringFromUtf8(string));
break;
}
}
return *result;
}

View File

@ -612,7 +612,7 @@ namespace internal {
F(WasmAllocateContinuation, 1, 1) \
F(WasmSyncStackLimit, 0, 1) \
F(WasmCreateResumePromise, 2, 1) \
F(WasmStringNewWtf8, 4, 1) \
F(WasmStringNewWtf8, 5, 1) \
F(WasmStringNewWtf16, 4, 1) \
F(WasmStringConst, 2, 1) \
F(WasmStringMeasureUtf8, 1, 1) \

View File

@ -137,6 +137,15 @@ template void Utf8DecoderBase<Wtf8Decoder>::Decode(
template void Utf8DecoderBase<Wtf8Decoder>::Decode(
uint16_t* out, const base::Vector<const uint8_t>& data);
template Utf8DecoderBase<StrictUtf8Decoder>::Utf8DecoderBase(
const base::Vector<const uint8_t>& data);
template void Utf8DecoderBase<StrictUtf8Decoder>::Decode(
uint8_t* out, const base::Vector<const uint8_t>& data);
template void Utf8DecoderBase<StrictUtf8Decoder>::Decode(
uint16_t* out, const base::Vector<const uint8_t>& data);
#endif // V8_ENABLE_WEBASSEMBLY
} // namespace internal

View File

@ -107,7 +107,8 @@ class V8_EXPORT_PRIVATE Utf8Decoder final
#if V8_ENABLE_WEBASSEMBLY
// Like Utf8Decoder above, except that instead of replacing invalid sequences
// with U+FFFD, we have a separate Encoding::kInvalid state.
// with U+FFFD, we have a separate Encoding::kInvalid state, and we also accept
// isolated surrogates.
class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
public:
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
@ -121,6 +122,26 @@ class Wtf8Decoder : public Utf8DecoderBase<Wtf8Decoder> {
bool is_invalid() const { return encoding_ == Encoding::kInvalid; }
};
// Like Utf8Decoder above, except that instead of replacing invalid sequences
// with U+FFFD, we have a separate Encoding::kInvalid state, which callers can
// query through is_invalid().
class StrictUtf8Decoder : public Utf8DecoderBase<StrictUtf8Decoder> {
public:
// Always returns false: no pair of decoded code points is rejected here.
// `previous` is unused and `current` is only DCHECKed.
static bool InvalidCodePointSequence(uint32_t current, uint32_t previous) {
// The DfaDecoder will only ever decode Unicode scalar values, and all
// sequences of USVs are valid.
DCHECK(!unibrow::Utf16::IsLeadSurrogate(current));
DCHECK(!unibrow::Utf16::IsTrailSurrogate(current));
return false;
}
// Policy flag for the Utf8DecoderBase template.
// NOTE(review): presumably makes input that ends mid-sequence an error --
// confirm against Utf8DecoderBase.
static const bool kAllowIncompleteSequences = false;
// State machine used for decoding.
using DfaDecoder = Utf8DfaDecoder;
explicit StrictUtf8Decoder(const base::Vector<const uint8_t>& data)
: Utf8DecoderBase(data) {}
// True when the input was classified as Encoding::kInvalid.
bool is_invalid() const { return encoding_ == Encoding::kInvalid; }
};
#endif // V8_ENABLE_WEBASSEMBLY
} // namespace internal

View File

@ -6011,22 +6011,30 @@ class LiftoffCompiler {
}
void StringNewWtf8(FullDecoder* decoder,
const MemoryIndexImmediate<validate>& imm,
const EncodeWtf8Immediate<validate>& imm,
const Value& offset, const Value& size, Value* result) {
LiftoffRegList pinned;
LiftoffRegister memory_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
__ LoadConstant(memory_reg, WasmValue(static_cast<int32_t>(imm.index)));
LiftoffAssembler::VarState memory_var(kI32, memory_reg, 0);
LoadSmi(memory_reg, imm.memory.index);
LiftoffAssembler::VarState memory_var(kSmiKind, memory_reg, 0);
CallRuntimeStub(WasmCode::kWasmStringNewWtf8,
MakeSig::Returns(kRef).Params(kI32, kI32, kI32),
{
memory_var,
__ cache_state()->stack_state.end()[-2], // offset
__ cache_state()->stack_state.end()[-1] // size
},
decoder->position());
LiftoffRegister policy_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(policy_reg, static_cast<int32_t>(imm.policy.value));
LiftoffAssembler::VarState policy_var(kSmiKind, policy_reg, 0);
CallRuntimeStub(
WasmCode::kWasmStringNewWtf8,
MakeSig::Returns(kRef).Params(kI32, kI32, kSmiKind, kSmiKind),
{
__ cache_state()->stack_state.end()[-2], // offset
__ cache_state()->stack_state.end()[-1], // size
memory_var,
policy_var,
},
decoder->position());
__ cache_state()->stack_state.pop_back(2);
RegisterDebugSideTableEntry(decoder, DebugSideTableBuilder::kDidSpill);
@ -6135,12 +6143,12 @@ class LiftoffCompiler {
LiftoffRegister memory_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(memory_reg, imm.memory.index);
LiftoffAssembler::VarState memory_var(kPointerKind, memory_reg, 0);
LiftoffAssembler::VarState memory_var(kSmiKind, memory_reg, 0);
LiftoffRegister policy_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(policy_reg, static_cast<int32_t>(imm.policy.value));
LiftoffAssembler::VarState policy_var(kPointerKind, policy_reg, 0);
LiftoffAssembler::VarState policy_var(kSmiKind, policy_reg, 0);
CallRuntimeStub(WasmCode::kWasmStringEncodeWtf8,
MakeSig::Params(kRef, kI32, kSmiKind, kSmiKind),
@ -6171,7 +6179,7 @@ class LiftoffCompiler {
LiftoffRegister memory_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(memory_reg, imm.index);
LiftoffAssembler::VarState memory_var(kPointerKind, memory_reg, 0);
LiftoffAssembler::VarState memory_var(kSmiKind, memory_reg, 0);
CallRuntimeStub(WasmCode::kWasmStringEncodeWtf16,
MakeSig::Params(kRef, kI32, kSmiKind),
@ -6322,7 +6330,7 @@ class LiftoffCompiler {
LiftoffRegister memory_reg =
pinned.set(__ GetUnusedRegister(kGpReg, pinned));
LoadSmi(memory_reg, imm.index);
LiftoffAssembler::VarState memory_var(kPointerKind, memory_reg, 0);
LiftoffAssembler::VarState memory_var(kSmiKind, memory_reg, 0);
CallRuntimeStub(WasmCode::kWasmStringViewWtf16Encode,
MakeSig::Params(kI32, kI32, kI32, kRef, kSmiKind),

View File

@ -1133,7 +1133,7 @@ struct ControlBase : public PcForErrors<validate> {
uint32_t br_depth) \
F(BrOnNonArray, const Value& object, Value* value_on_fallthrough, \
uint32_t br_depth) \
F(StringNewWtf8, const MemoryIndexImmediate<validate>& imm, \
F(StringNewWtf8, const EncodeWtf8Immediate<validate>& imm, \
const Value& offset, const Value& size, Value* result) \
F(StringNewWtf16, const MemoryIndexImmediate<validate>& imm, \
const Value& offset, const Value& size, Value* result) \
@ -2012,13 +2012,13 @@ class WasmDecoder : public Decoder {
case kExprRefTest:
case kExprRefCast:
return length;
case kExprStringNewWtf8:
case kExprStringNewWtf16:
case kExprStringEncodeWtf16:
case kExprStringViewWtf16Encode: {
MemoryIndexImmediate<validate> imm(decoder, pc + length);
return length + imm.length;
}
case kExprStringNewWtf8:
case kExprStringEncodeWtf8:
case kExprStringViewWtf8Encode: {
EncodeWtf8Immediate<validate> imm(decoder, pc + length);
@ -5190,7 +5190,7 @@ class WasmFullDecoder : public WasmDecoder<validate, decoding_mode> {
switch (opcode) {
case kExprStringNewWtf8: {
NON_CONST_ONLY
MemoryIndexImmediate<validate> imm(this, this->pc_ + opcode_length);
EncodeWtf8Immediate<validate> imm(this, this->pc_ + opcode_length);
if (!this->Validate(this->pc_ + opcode_length, imm)) return 0;
ValueType addr_type = this->module_->is_memory64 ? kWasmI64 : kWasmI32;
Value offset = Peek(1, 0, addr_type);

View File

@ -1397,9 +1397,10 @@ class WasmGraphBuildingInterface {
}
void StringNewWtf8(FullDecoder* decoder,
const MemoryIndexImmediate<validate>& imm,
const EncodeWtf8Immediate<validate>& imm,
const Value& offset, const Value& size, Value* result) {
result->node = builder_->StringNewWtf8(imm.index, offset.node, size.node);
result->node = builder_->StringNewWtf8(imm.memory.index, imm.policy.value,
offset.node, size.node);
}
void StringNewWtf16(FullDecoder* decoder,

View File

@ -53,6 +53,25 @@ let interestingStrings = ['',
'isolated \ud800 leading',
'isolated \udc00 trailing'];
// Returns true when `codepoint` lies in the UTF-16 surrogate range
// [0xD800, 0xDFFF].
function IsSurrogate(codepoint) {
  const kFirstSurrogate = 0xD800;
  const kLastSurrogate = 0xDFFF;
  return kFirstSurrogate <= codepoint && codepoint <= kLastSurrogate;
}
// Returns true when `str` contains a surrogate code unit that is not part of
// a valid pair. Iterating a string by code point yields a paired surrogate as
// one element >= 0x10000, so any element that is itself a surrogate is
// isolated.
function HasIsolatedSurrogate(str) {
  return Array.from(str).some(ch => IsSurrogate(ch.codePointAt(0)));
}
// Returns a copy of `str` in which every isolated surrogate is replaced by
// `replacement` (U+FFFD by default). All other code points are kept as-is.
function ReplaceIsolatedSurrogates(str, replacement='\ufffd') {
  const substitute = ch => IsSurrogate(ch.codePointAt(0)) ? replacement : ch;
  return Array.from(str).reduce((out, ch) => out + substitute(ch), '');
}
function makeWtf8TestDataSegment() {
let data = []
let valid = {};
@ -85,20 +104,54 @@ function makeWtf8TestDataSegment() {
let data = makeWtf8TestDataSegment();
builder.addDataSegment(0, data.data);
builder.addFunction("string_new_utf8", kSig_w_ii)
.exportFunc()
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyReject
]);
builder.addFunction("string_new_wtf8", kSig_w_ii)
.exportFunc()
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyAccept
]);
builder.addFunction("string_new_utf8_sloppy", kSig_w_ii)
.exportFunc()
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyReplace
]);
let instance = builder.instantiate();
for (let [str, {offset, length}] of Object.entries(data.valid)) {
assertEquals(str, instance.exports.string_new_wtf8(offset, length));
if (HasIsolatedSurrogate(str)) {
assertThrows(() => instance.exports.string_new_utf8(offset, length),
WebAssembly.RuntimeError, "invalid UTF-8 string");
// Isolated surrogates have the three-byte pattern ED [A0,BF]
// [80,BF]. When the sloppy decoder gets to the second byte, it
// will reject the sequence, and then retry parsing at the second
// byte. Seeing the second byte can't start a sequence, it
// replaces the second byte and continues with the next, which
// also can't start a sequence. The result is that one isolated
// surrogate is replaced by three U+FFFD codepoints.
assertEquals(ReplaceIsolatedSurrogates(str, '\ufffd\ufffd\ufffd'),
instance.exports.string_new_utf8_sloppy(offset, length));
} else {
assertEquals(str, instance.exports.string_new_utf8(offset, length));
assertEquals(str,
instance.exports.string_new_utf8_sloppy(offset, length));
}
}
for (let [str, {offset, length}] of Object.entries(data.invalid)) {
assertThrows(() => instance.exports.string_new_wtf8(offset, length),
WebAssembly.RuntimeError, "invalid WTF-8 string");
assertThrows(() => instance.exports.string_new_utf8(offset, length),
WebAssembly.RuntimeError, "invalid UTF-8 string");
}
})();
@ -168,17 +221,6 @@ function makeWtf16TestDataSegment() {
}
})();
function IsSurrogate(codepoint) {
return 0xD800 <= codepoint && codepoint <= 0xDFFF
}
function HasIsolatedSurrogate(str) {
for (let codepoint of str) {
let value = codepoint.codePointAt(0);
if (IsSurrogate(value)) return true;
}
return false;
}
(function TestStringMeasureUtf8AndWtf8() {
let builder = new WasmModuleBuilder();
@ -342,12 +384,7 @@ function HasIsolatedSurrogate(str) {
for (let str of interestingStrings) {
let offset = 42;
instance.exports.encode_replace(str, offset);
let replaced = '';
for (let codepoint of str) {
codepoint = codepoint.codePointAt(0);
if (IsSurrogate(codepoint)) codepoint = 0xFFFD;
replaced += String.fromCodePoint(codepoint);
}
let replaced = ReplaceIsolatedSurrogates(str);
if (!HasIsolatedSurrogate(str)) assertEquals(str, replaced);
let wtf8 = encodeWtf8(replaced);
checkMemory(offset, wtf8);

View File

@ -73,10 +73,22 @@ let kSig_w_zi = makeSig([kWasmStringViewIter, kWasmI32],
builder.addMemory(0, undefined, false, false);
builder.addFunction("string.new_wtf8", kSig_w_ii)
builder.addFunction("string.new_wtf8/reject", kSig_w_ii)
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyReject
]);
builder.addFunction("string.new_wtf8/accept", kSig_w_ii)
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyAccept
]);
builder.addFunction("string.new_wtf8/replace", kSig_w_ii)
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyReplace
]);
builder.addFunction("string.new_wtf16", kSig_w_ii)
@ -263,7 +275,7 @@ assertInvalid(
builder.addFunction("string.new_wtf8/no-mem", kSig_w_ii)
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 0
kGCPrefix, kExprStringNewWtf8, 0, kWtf8PolicyAccept
]);
},
"Compiling function #0:\"string.new_wtf8/no-mem\" failed: " +
@ -275,7 +287,7 @@ assertInvalid(
builder.addFunction("string.new_wtf8/bad-mem", kSig_w_ii)
.addBody([
kExprLocalGet, 0, kExprLocalGet, 1,
kGCPrefix, kExprStringNewWtf8, 1
kGCPrefix, kExprStringNewWtf8, 1, kWtf8PolicyAccept
]);
},
"Compiling function #0:\"string.new_wtf8/bad-mem\" failed: " +

View File

@ -886,6 +886,11 @@ let kExprI32x4TruncSatF64x2UZero = 0xfd;
let kExprF64x2ConvertLowI32x4S = 0xfe;
let kExprF64x2ConvertLowI32x4U = 0xff;
// WTF-8 parsing policies. Emitted as the policy immediate that follows the
// memory index in string.new_wtf8.
//   Reject:  strict UTF-8; isolated surrogates trap ("invalid UTF-8 string").
//   Accept:  WTF-8; isolated surrogates are kept in the resulting string.
//   Replace: sloppy UTF-8; invalid subsequences are replaced with U+FFFD.
// NOTE(review): values presumably must match wasm::StringRefWtf8Policy on the
// C++ side -- confirm.
let kWtf8PolicyReject = 0;
let kWtf8PolicyAccept = 1;
let kWtf8PolicyReplace = 2;
// Compilation hint constants.
let kCompilationHintStrategyDefault = 0x00;
let kCompilationHintStrategyLazy = 0x01;

View File

@ -4838,8 +4838,8 @@ TEST_F(WasmOpcodeLengthTest, GCOpcodes) {
ExpectLength(3, 0xfb, 0x01, 0x42);
ExpectLength(4, 0xfb, 0x01, 0x80, 0x00);
// string.new_wtf8 with $mem=0.
ExpectLength(3, 0xfb, 0x80, 0x00);
// string.new_wtf8 with $mem=0, $policy=0.
ExpectLength(4, 0xfb, 0x80, 0x00, 0x00);
// string.as_wtf8.
ExpectLength(2, 0xfb, 0x90);