Reland "[wasm][liftoff] Cache the memory start register"

On a loop back edge, both the cached instance and the cached memory
start have to be restored for the next loop iteration. The original CL
did not consider that restoring the instance may overwrite the register
that currently caches the memory start.
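
To illustrate the problem this reland fixes, here is a minimal standalone model (toy types and register numbers, not the actual LiftoffAssembler::CacheState): when the back edge restores the instance into the register that currently caches the memory start, the stale memory-start cache entry has to be cleared, which is what the added code in MergeStackWith does.

// Simplified model of the back-edge hazard; names are hypothetical.
#include <cassert>

constexpr int kNoReg = -1;

struct ToyCacheState {
  int cached_instance = kNoReg;   // register index holding the instance
  int cached_mem_start = kNoReg;  // register index holding the memory start
};

// Restoring the instance into {instance_reg} on a back edge: if that register
// currently caches the memory start, drop the memory-start cache entry,
// otherwise later accesses would treat the instance as the memory start.
void RestoreInstanceOnBackEdge(ToyCacheState* state, int instance_reg) {
  if (state->cached_mem_start == instance_reg) {
    state->cached_mem_start = kNoReg;  // the case missed by the original CL
  }
  state->cached_instance = instance_reg;
}

int main() {
  ToyCacheState state;
  state.cached_mem_start = 5;            // r5 currently caches the memory start
  RestoreInstanceOnBackEdge(&state, 5);  // loop header wants the instance in r5
  assert(state.cached_mem_start == kNoReg);  // stale entry was cleared
  return 0;
}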

Original description:

WebAssembly functions often contain sequences of memory accesses, and each
of these accesses needs the start address of the memory in a register.
With this CL the register holding the memory start address is cached, so
only the first memory access has to load the memory start address into a
register; subsequent memory accesses can simply reuse that register.
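
As a rough sketch of that pattern (toy types and register numbers, not the actual Liftoff implementation; the real helper is GetMemoryStart() in the liftoff-compiler.cc diff below):

// Minimal model of "load the memory start once, then reuse the register".
struct ToyCompiler {
  int cached_mem_start_reg = -1;  // -1 means "not cached yet"
  int loads_emitted = 0;

  // Stand-in for emitting a load of the memory start from the instance
  // object; returns the register that now holds it.
  int EmitLoadMemoryStart() {
    ++loads_emitted;
    return 7;  // arbitrary register index for the sketch
  }

  // Only the first memory access pays for the instance-field load; later
  // accesses reuse the cached register.
  int GetMemoryStart() {
    if (cached_mem_start_reg == -1) {
      cached_mem_start_reg = EmitLoadMemoryStart();
    }
    return cached_mem_start_reg;
  }
};

int main() {
  ToyCompiler c;
  c.GetMemoryStart();  // first access: emits the load
  c.GetMemoryStart();  // subsequent accesses: no extra code
  return c.loads_emitted == 1 ? 0 : 1;
}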

Initial measurements with the epic benchmark show that this reduces the
size of the generated Liftoff code by a bit more than 5%.

R=clemensb@chromium.org

Bug: v8:11862
Change-Id: I884c0da24be8bc6b10f2c6bf5437b9a279819538
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2960220
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Andreas Haas <ahaas@chromium.org>
Cr-Commit-Position: refs/heads/master@{#75183}
Andreas Haas 2021-06-16 14:19:24 +02:00 committed by V8 LUCI CQ
parent 6e6aa89579
commit 274aaaafa0
3 changed files with 133 additions and 48 deletions

src/wasm/baseline/liftoff-assembler.cc

@@ -15,6 +15,7 @@
#include "src/utils/ostreams.h"
#include "src/wasm/baseline/liftoff-register.h"
#include "src/wasm/function-body-decoder-impl.h"
#include "src/wasm/object-access.h"
#include "src/wasm/wasm-linkage.h"
#include "src/wasm/wasm-opcodes.h"
@@ -446,6 +447,10 @@ void LiftoffAssembler::CacheState::InitMerge(const CacheState& source,
SetInstanceCacheRegister(source.cached_instance);
}
if (source.cached_mem_start != no_reg) {
SetMemStartCacheRegister(source.cached_mem_start);
}
uint32_t stack_base = stack_depth + num_locals;
uint32_t target_height = stack_base + arity;
uint32_t discarded = source.stack_height() - target_height;
@@ -709,10 +714,13 @@ void LiftoffAssembler::MergeFullStackWith(CacheState& target,
}
// Full stack merging is only done for forward jumps, so we can just clear the
// instance cache register at the target in case of mismatch.
// cache registers at the target in case of mismatch.
if (source.cached_instance != target.cached_instance) {
target.ClearCachedInstanceRegister();
}
if (source.cached_mem_start != target.cached_mem_start) {
target.ClearCachedMemStartRegister();
}
}
void LiftoffAssembler::MergeStackWith(CacheState& target, uint32_t arity,
@@ -752,6 +760,39 @@ void LiftoffAssembler::MergeStackWith(CacheState& target, uint32_t arity,
Move(target.cached_instance, cache_state_.cached_instance,
kPointerKind);
}
if (target.cached_instance == cache_state_.cached_mem_start) {
// We just overwrote the cached mem-start register, so we have to tell
// the cache state about it.
cache_state_.ClearCachedMemStartRegister();
}
}
}
if (cache_state_.cached_mem_start != target.cached_mem_start &&
target.cached_mem_start != no_reg) {
if (jump_direction == kForwardJump) {
// On forward jumps, reset the cached memory start in the target state.
target.ClearCachedMemStartRegister();
} else {
// On backward jumps, we already generated code assuming that the
// memory start is available in that register. Thus move it there.
if (cache_state_.cached_mem_start == no_reg) {
// {target.cached_instance} already got restored above, so we can use it
// if it exists.
Register instance = target.cached_instance;
if (instance == no_reg) {
// We don't have the instance available yet. Store it into the target
// mem_start, so that we can load the mem_start from there.
instance = target.cached_mem_start;
LoadInstanceFromFrame(instance);
}
LoadFromInstance(
target.cached_mem_start, instance,
ObjectAccess::ToTagged(WasmInstanceObject::kMemoryStartOffset),
sizeof(size_t));
} else {
Move(target.cached_mem_start, cache_state_.cached_mem_start,
kPointerKind);
}
}
}
}
@@ -784,7 +825,7 @@ void LiftoffAssembler::SpillAllRegisters() {
Spill(slot.offset(), slot.reg(), slot.kind());
slot.MakeStack();
}
cache_state_.ClearCachedInstanceRegister();
cache_state_.ClearAllCacheRegisters();
cache_state_.reset_used_registers();
}
@@ -793,9 +834,21 @@ void LiftoffAssembler::ClearRegister(
LiftoffRegList pinned) {
if (reg == cache_state()->cached_instance) {
cache_state()->ClearCachedInstanceRegister();
// We can return immediately. The instance is only used to load information
// at the beginning of an instruction when values don't have to be in
// specific registers yet. Therefore the instance should never be one of the
// {possible_uses}.
for (Register* use : possible_uses) {
USE(use);
DCHECK_NE(reg, *use);
}
return;
}
if (cache_state()->is_used(LiftoffRegister(reg))) {
} else if (reg == cache_state()->cached_mem_start) {
cache_state()->ClearCachedMemStartRegister();
// The memory start may be among the {possible_uses}, e.g. for an atomic
// compare exchange. Therefore it is necessary to iterate over the
// {possible_uses} below, and we cannot return early.
} else if (cache_state()->is_used(LiftoffRegister(reg))) {
SpillRegister(LiftoffRegister(reg));
}
Register replacement = no_reg;
@@ -891,7 +944,7 @@ void LiftoffAssembler::PrepareCall(const ValueKindSig* sig,
constexpr size_t kInputShift = 1;
// Spill all cache slots which are not being used as parameters.
cache_state_.ClearCachedInstanceRegister();
cache_state_.ClearAllCacheRegisters();
for (VarState* it = cache_state_.stack_state.end() - 1 - num_params;
it >= cache_state_.stack_state.begin() &&
!cache_state_.used_registers.is_empty();
@@ -1125,13 +1178,15 @@ bool LiftoffAssembler::ValidateCacheState() const {
}
used_regs.set(reg);
}
if (cache_state_.cached_instance != no_reg) {
DCHECK(!used_regs.has(cache_state_.cached_instance));
int liftoff_code =
LiftoffRegister{cache_state_.cached_instance}.liftoff_code();
used_regs.set(cache_state_.cached_instance);
DCHECK_EQ(0, register_use_count[liftoff_code]);
register_use_count[liftoff_code] = 1;
for (Register cache_reg :
{cache_state_.cached_instance, cache_state_.cached_mem_start}) {
if (cache_reg != no_reg) {
DCHECK(!used_regs.has(cache_reg));
int liftoff_code = LiftoffRegister{cache_reg}.liftoff_code();
used_regs.set(cache_reg);
DCHECK_EQ(0, register_use_count[liftoff_code]);
register_use_count[liftoff_code] = 1;
}
}
bool valid = memcmp(register_use_count, cache_state_.register_use_count,
sizeof(register_use_count)) == 0 &&

src/wasm/baseline/liftoff-assembler.h

@@ -200,6 +200,7 @@ class LiftoffAssembler : public TurboAssembler {
uint32_t register_use_count[kAfterMaxLiftoffRegCode] = {0};
LiftoffRegList last_spilled_regs;
Register cached_instance = no_reg;
Register cached_mem_start = no_reg;
bool has_unused_register(RegClass rc, LiftoffRegList pinned = {}) const {
if (kNeedI64RegPair && rc == kGpRegPair) {
@@ -250,31 +251,47 @@
// Volatile registers are registers which are used for caching values that
// can easily be reloaded. Those are returned first if we run out of free
// registers.
// Note: This interface is a bit more generic than currently needed, in
// anticipation of more "volatile registers" being added later.
bool has_volatile_register(LiftoffRegList candidates) {
return cached_instance != no_reg && candidates.has(cached_instance);
return (cached_instance != no_reg && candidates.has(cached_instance)) ||
(cached_mem_start != no_reg && candidates.has(cached_mem_start));
}
LiftoffRegister take_volatile_register(LiftoffRegList candidates) {
DCHECK(candidates.has(cached_instance));
LiftoffRegister ret{cached_instance};
DCHECK(has_volatile_register(candidates));
Register reg = no_reg;
if (cached_instance != no_reg && candidates.has(cached_instance)) {
reg = cached_instance;
cached_instance = no_reg;
} else {
DCHECK(candidates.has(cached_mem_start));
reg = cached_mem_start;
cached_mem_start = no_reg;
}
LiftoffRegister ret{reg};
DCHECK_EQ(1, register_use_count[ret.liftoff_code()]);
register_use_count[ret.liftoff_code()] = 0;
used_registers.clear(ret);
cached_instance = no_reg;
return ret;
}
void SetInstanceCacheRegister(Register reg) {
DCHECK_EQ(no_reg, cached_instance);
cached_instance = reg;
void SetCacheRegister(Register* cache, Register reg) {
DCHECK_EQ(no_reg, *cache);
*cache = reg;
int liftoff_code = LiftoffRegister{reg}.liftoff_code();
DCHECK_EQ(0, register_use_count[liftoff_code]);
register_use_count[liftoff_code] = 1;
used_registers.set(reg);
}
void SetInstanceCacheRegister(Register reg) {
SetCacheRegister(&cached_instance, reg);
}
void SetMemStartCacheRegister(Register reg) {
SetCacheRegister(&cached_mem_start, reg);
}
Register TrySetCachedInstanceRegister(LiftoffRegList pinned) {
DCHECK_EQ(no_reg, cached_instance);
LiftoffRegList available_regs =
@@ -290,13 +307,24 @@ class LiftoffAssembler : public TurboAssembler {
return new_cache_reg;
}
void ClearCachedInstanceRegister() {
if (cached_instance == no_reg) return;
int liftoff_code = LiftoffRegister{cached_instance}.liftoff_code();
void ClearCacheRegister(Register* cache) {
if (*cache == no_reg) return;
int liftoff_code = LiftoffRegister{*cache}.liftoff_code();
DCHECK_EQ(1, register_use_count[liftoff_code]);
register_use_count[liftoff_code] = 0;
used_registers.clear(cached_instance);
cached_instance = no_reg;
used_registers.clear(*cache);
*cache = no_reg;
}
void ClearCachedInstanceRegister() { ClearCacheRegister(&cached_instance); }
void ClearCachedMemStartRegister() {
ClearCacheRegister(&cached_mem_start);
}
void ClearAllCacheRegisters() {
ClearCacheRegister(&cached_instance);
ClearCacheRegister(&cached_mem_start);
}
void inc_used(LiftoffRegister reg) {
@@ -551,6 +579,8 @@ class LiftoffAssembler : public TurboAssembler {
if (cache_state_.is_free(r)) continue;
if (r.is_gp() && cache_state_.cached_instance == r.gp()) {
cache_state_.ClearCachedInstanceRegister();
} else if (r.is_gp() && cache_state_.cached_mem_start == r.gp()) {
cache_state_.ClearCachedMemStartRegister();
} else {
SpillRegister(r);
}

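The header changes above replace the instance-specific cache helpers with generic ones that take a pointer to the cache slot, so the instance cache and the new memory-start cache share one code path. A minimal standalone sketch of that design choice (toy types and register numbers; the real helpers also maintain register_use_count and used_registers):

// Toy cache state with one shared code path for both cache slots, mirroring
// the SetCacheRegister/ClearCacheRegister helpers in the header above.
#include <cassert>

constexpr int kNoReg = -1;

struct ToyCacheState {
  int cached_instance = kNoReg;
  int cached_mem_start = kNoReg;

  void SetCacheRegister(int* cache, int reg) { *cache = reg; }
  void ClearCacheRegister(int* cache) { *cache = kNoReg; }
  void ClearAllCacheRegisters() {
    ClearCacheRegister(&cached_instance);
    ClearCacheRegister(&cached_mem_start);
  }
};

int main() {
  ToyCacheState state;
  state.SetCacheRegister(&state.cached_instance, 3);
  state.SetCacheRegister(&state.cached_mem_start, 5);
  state.ClearAllCacheRegisters();
  assert(state.cached_instance == kNoReg && state.cached_mem_start == kNoReg);
  return 0;
}

Parameterizing on the cache slot keeps ClearAllCacheRegisters trivial to extend, in line with the header comment anticipating more "volatile registers" being added later.
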
src/wasm/baseline/liftoff-compiler.cc

@@ -2817,6 +2817,17 @@ class LiftoffCompiler {
return true;
}
Register GetMemoryStart(LiftoffRegList pinned) {
Register memory_start = __ cache_state()->cached_mem_start;
if (memory_start == no_reg) {
memory_start = __ GetUnusedRegister(kGpReg, pinned).gp();
LOAD_INSTANCE_FIELD(memory_start, MemoryStart, kSystemPointerSize,
pinned);
__ cache_state()->SetMemStartCacheRegister(memory_start);
}
return memory_start;
}
void LoadMem(FullDecoder* decoder, LoadType type,
const MemoryAccessImmediate<validate>& imm,
const Value& index_val, Value* result) {
@@ -2835,8 +2846,7 @@
__ cache_state()->stack_state.pop_back();
DEBUG_CODE_COMMENT("load from memory (constant offset)");
LiftoffRegList pinned;
Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
Register mem = pinned.set(GetMemoryStart(pinned));
LiftoffRegister value = pinned.set(__ GetUnusedRegister(rc, pinned));
__ Load(value, mem, no_reg, offset, type, pinned, nullptr, true,
i64_offset);
@@ -2853,8 +2863,7 @@
// Load the memory start address only now to reduce register pressure
// (important on ia32).
Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
Register mem = pinned.set(GetMemoryStart(pinned));
LiftoffRegister value = pinned.set(__ GetUnusedRegister(rc, pinned));
uint32_t protected_load_pc = 0;
@@ -2897,8 +2906,7 @@
LiftoffRegList pinned = LiftoffRegList::ForRegs(index);
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("load with transformation");
Register addr = __ GetUnusedRegister(kGpReg, pinned).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = GetMemoryStart(pinned);
LiftoffRegister value = __ GetUnusedRegister(reg_class_for(kS128), {});
uint32_t protected_load_pc = 0;
__ LoadTransform(value, addr, index, offset, type, transform,
@@ -2938,8 +2946,7 @@
pinned.set(index);
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("load lane");
Register addr = __ GetUnusedRegister(kGpReg, pinned).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = GetMemoryStart(pinned);
LiftoffRegister result = __ GetUnusedRegister(reg_class_for(kS128), {});
uint32_t protected_load_pc = 0;
@@ -2974,8 +2981,7 @@
if (IndexStaticallyInBounds(index_slot, type.size(), &offset)) {
__ cache_state()->stack_state.pop_back();
DEBUG_CODE_COMMENT("store to memory (constant offset)");
Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
Register mem = pinned.set(GetMemoryStart(pinned));
__ Store(mem, no_reg, offset, value, type, pinned, nullptr, true);
} else {
LiftoffRegister full_index = __ PopToRegister(pinned);
@@ -2989,8 +2995,7 @@
uint32_t protected_store_pc = 0;
// Load the memory start address only now to reduce register pressure
// (important on ia32).
Register mem = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(mem, MemoryStart, kSystemPointerSize, pinned);
Register mem = pinned.set(GetMemoryStart(pinned));
LiftoffRegList outer_pinned;
if (V8_UNLIKELY(FLAG_trace_wasm_memory)) outer_pinned.set(index);
__ Store(mem, index, offset, value, type, outer_pinned,
@@ -3022,8 +3027,7 @@
pinned.set(index);
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("store lane to memory");
Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = pinned.set(GetMemoryStart(pinned));
uint32_t protected_store_pc = 0;
__ StoreLane(addr, index, offset, value, type, lane, &protected_store_pc);
if (env_->use_trap_handler) {
@@ -4271,8 +4275,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("atomic store to memory");
Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = pinned.set(GetMemoryStart(pinned));
LiftoffRegList outer_pinned;
if (V8_UNLIKELY(FLAG_trace_wasm_memory)) outer_pinned.set(index);
__ AtomicStore(addr, index, offset, value, type, outer_pinned);
@@ -4295,8 +4298,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
DEBUG_CODE_COMMENT("atomic load from memory");
Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = pinned.set(GetMemoryStart(pinned));
RegClass rc = reg_class_for(kind);
LiftoffRegister value = pinned.set(__ GetUnusedRegister(rc, pinned));
__ AtomicLoad(value, addr, index, offset, type, pinned);
@@ -4343,8 +4345,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = pinned.set(GetMemoryStart(pinned));
(asm_.*emit_fn)(addr, index, offset, value, result, type);
__ PushRegister(result_kind, result);
@@ -4400,8 +4401,7 @@
uintptr_t offset = imm.offset;
index = AddMemoryMasking(index, &offset, &pinned);
Register addr = pinned.set(__ GetUnusedRegister(kGpReg, pinned)).gp();
LOAD_INSTANCE_FIELD(addr, MemoryStart, kSystemPointerSize, pinned);
Register addr = pinned.set(GetMemoryStart(pinned));
LiftoffRegister result =
pinned.set(__ GetUnusedRegister(reg_class_for(result_kind), pinned));