PPC/s390: [Liftoff] Improve initialization for many locals

Port a8cdda9947

Original Commit Message:

    WebAssembly locals are specified to be zero on function entry. Liftoff
    implements this by just storing the constant 0 in the virtual stack for
    integer types, and using one floating point register initialized to
    zero for all floating point types.
    For big counts of locals this leads to problems (manifesting as huge
    blocks of code being generated) once we hit a merge point: All those
    constants (for int) and all duplicate register uses (for floats) need to
    be fixed up, by using separate registers for the locals or spilling to
    the stack if no more registers are available. All this spilling
    generates a lot of code, and can even happen multiple times within a
    function.

    This CL optimizes for such cases by spilling all locals to the stack
    initially. All merges within the function body get much smaller then.
    The spilled values rarely have to be loaded anyway, because the initial
    zero value is usually overwritten before the first use.

    To optimize the code size for initializing big numbers of locals on the
    stack, this CL also introduces the platform-specific
    {FillStackSlotsWithZero} method which uses a loop for bigger local
    counts.

    This often saves dozens of kilobytes for very big functions, and shows
    an overall code size reduction of 4-5 percent for big modules.

R=clemensb@chromium.org, joransiu@ca.ibm.com, jyan@ca.ibm.com, michael_dawson@ca.ibm.com
BUG=
LOG=N

Change-Id: I2459080a1f6acfdd212e9a93a868d028980c5554
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1863370
Reviewed-by: Junliang Yan <jyan@ca.ibm.com>
Reviewed-by: Milad Farazmand <miladfar@ca.ibm.com>
Commit-Queue: Milad Farazmand <miladfar@ca.ibm.com>
Cr-Commit-Position: refs/heads/master@{#64301}
This commit is contained in:
Milad Farazmand 2019-10-15 13:04:36 -04:00 committed by Commit Bot
parent e359c49b9d
commit c314cf7466
2 changed files with 163 additions and 0 deletions

View File

@ -12,6 +12,49 @@ namespace v8 {
namespace internal {
namespace wasm {
namespace liftoff {
// half
// slot Frame
// -----+--------------------+---------------------------
// n+3 | parameter n |
// ... | ... |
// 4 | parameter 1 | or parameter 2
// 3 | parameter 0 | or parameter 1
// 2 | (result address) | or parameter 0
// -----+--------------------+---------------------------
// 1 | return addr (lr) |
// 0 | previous frame (fp)|
// -----+--------------------+ <-- frame ptr (fp)
// -1 | 0xa: WASM_COMPILED |
// -2 | instance |
// -----+--------------------+---------------------------
// -3 | slot 0 (high) | ^
// -4 | slot 0 (low) | |
// -5 | slot 1 (high) | Frame slots
// -6 | slot 1 (low) | |
// | | v
// -----+--------------------+ <-- stack ptr (sp)
//
// Byte distance below fp of the instance slot (half slot -2 in the frame
// diagram above).
constexpr int32_t kInstanceOffset = 2 * kSystemPointerSize;
// Byte distance below fp of the low word of spill slot 0 (half slot -4 in the
// frame diagram above).
constexpr int32_t kFirstStackSlotOffset =
    kInstanceOffset + 2 * kSystemPointerSize;
// Returns the distance in bytes, measured downward from fp, of the lowest
// address of spill slot {index}.
inline int GetStackSlotOffset(uint32_t index) {
  const auto bytes_before_slot = index * LiftoffAssembler::kStackSlotSize;
  return kFirstStackSlotOffset + bytes_before_slot;
}
// Builds the fp-relative operand addressing one half of spill slot {index}.
// The low word sits at the slot's base; the high word sits half a slot closer
// to fp.
inline MemOperand GetHalfStackSlot(uint32_t index, RegPairHalf half) {
  int32_t toward_fp = 0;
  if (half != kLowWord) {
    toward_fp = LiftoffAssembler::kStackSlotSize / 2;
  }
  const int32_t distance_below_fp =
      kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize -
      toward_fp;
  return MemOperand(fp, -distance_below_fp);
}
} // namespace liftoff
int LiftoffAssembler::PrepareStackFrame() {
bailout(kUnsupportedArchitecture, "PrepareStackFrame");
return 0;
@ -108,6 +151,45 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
bailout(kUnsupportedArchitecture, "FillI64Half");
}
// Emits code that zero-initializes {count} consecutive spill slots starting at
// slot {index}. {count} must be non-zero. Every register used as a temporary
// is saved on the stack first and restored before the emitted sequence ends.
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
  DCHECK_LT(0, count);
  uint32_t last_stack_slot = index + count - 1;
  RecordUsedSpillSlot(last_stack_slot);

  // We need a zero reg. Always use r0 for that, and push it before to restore
  // its value afterwards.
  push(r0);
  mov(r0, Operand(0));

  if (count <= 5) {
    // Special straight-line code for up to five slots. Generates two
    // instructions per slot.
    // NOTE(review): StoreP presumably writes a pointer-sized word; if
    // kSystemPointerSize is larger than kStackSlotSize / 2, the high-half
    // store would reach past the slot. Confirm for 64-bit targets.
    for (uint32_t offset = 0; offset < count; ++offset) {
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kLowWord));
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kHighWord));
    }
  } else {
    // General case for bigger counts: a fixed-size store loop.
    // Use r4 for start address (inclusive), r5 for end address (exclusive).
    push(r4);
    push(r5);

    // Slots grow downward from fp, so the last slot has the lowest address.
    subi(r4, fp, Operand(liftoff::GetStackSlotOffset(last_stack_slot)));
    // The exclusive end is one slot above the base of slot {index}, i.e.
    // fp - (offset(index) - kStackSlotSize).
    subi(r5, fp, Operand(liftoff::GetStackSlotOffset(index) - kStackSlotSize));

    Label loop;
    bind(&loop);
    // r0 is the value being stored; r4 is the cursor that walks up to r5.
    StoreP(r0, MemOperand(r4));
    addi(r4, r4, Operand(kSystemPointerSize));
    cmp(r4, r5);
    bne(&loop);

    // Restore in reverse push order.
    pop(r5);
    pop(r4);
  }

  pop(r0);
}
#define UNIMPLEMENTED_I32_BINOP(name) \
void LiftoffAssembler::emit_##name(Register dst, Register lhs, \
Register rhs) { \

View File

@ -12,6 +12,48 @@ namespace v8 {
namespace internal {
namespace wasm {
namespace liftoff {
// half
// slot Frame
// -----+--------------------+---------------------------
// n+3 | parameter n |
// ... | ... |
// 4 | parameter 1 | or parameter 2
// 3 | parameter 0 | or parameter 1
// 2 | (result address) | or parameter 0
// -----+--------------------+---------------------------
// 1 | return addr (lr) |
// 0 | previous frame (fp)|
// -----+--------------------+ <-- frame ptr (fp)
// -1 | 0xa: WASM_COMPILED |
// -2 | instance |
// -----+--------------------+---------------------------
// -3 | slot 0 (high) | ^
// -4 | slot 0 (low) | |
// -5 | slot 1 (high) | Frame slots
// -6 | slot 1 (low) | |
// | | v
// -----+--------------------+ <-- stack ptr (sp)
//
// Byte distance below fp of the instance slot (half slot -2 in the frame
// diagram above).
constexpr int32_t kInstanceOffset = 2 * kSystemPointerSize;
// Byte distance below fp of the low word of spill slot 0 (half slot -4 in the
// frame diagram above).
constexpr int32_t kFirstStackSlotOffset =
    kInstanceOffset + 2 * kSystemPointerSize;
// Returns the distance in bytes, measured downward from fp, of the lowest
// address of spill slot {index}.
inline int GetStackSlotOffset(uint32_t index) {
  const auto bytes_before_slot = index * LiftoffAssembler::kStackSlotSize;
  return kFirstStackSlotOffset + bytes_before_slot;
}
// Builds the fp-relative operand addressing one half of spill slot {index}.
// The low word sits at the slot's base; the high word sits half a slot closer
// to fp.
inline MemOperand GetHalfStackSlot(uint32_t index, RegPairHalf half) {
  int32_t toward_fp = 0;
  if (half != kLowWord) {
    toward_fp = LiftoffAssembler::kStackSlotSize / 2;
  }
  const int32_t distance_below_fp =
      kFirstStackSlotOffset + index * LiftoffAssembler::kStackSlotSize -
      toward_fp;
  return MemOperand(fp, -distance_below_fp);
}
} // namespace liftoff
int LiftoffAssembler::PrepareStackFrame() {
bailout(kUnsupportedArchitecture, "PrepareStackFrame");
return 0;
@ -108,6 +150,45 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t index, RegPairHalf) {
bailout(kUnsupportedArchitecture, "FillI64Half");
}
// Emits code that zero-initializes {count} consecutive spill slots starting at
// slot {index}. {count} must be non-zero. Every register used as a temporary
// is saved on the stack first and restored before the emitted sequence ends.
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
  DCHECK_LT(0, count);
  uint32_t last_stack_slot = index + count - 1;
  RecordUsedSpillSlot(last_stack_slot);

  // We need a zero reg. Always use r0 for that, and push it before to restore
  // its value afterwards.
  push(r0);
  mov(r0, Operand(0));

  if (count <= 5) {
    // Special straight-line code for up to five slots. Generates two
    // instructions per slot.
    // NOTE(review): StoreP presumably writes a pointer-sized word; if
    // kSystemPointerSize is larger than kStackSlotSize / 2, the high-half
    // store would reach past the slot. Confirm for 64-bit targets.
    for (uint32_t offset = 0; offset < count; ++offset) {
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kLowWord));
      StoreP(r0, liftoff::GetHalfStackSlot(index + offset, kHighWord));
    }
  } else {
    // General case for bigger counts: a fixed-size store loop.
    // Use r3 for start address (inclusive), r4 for end address (exclusive).
    push(r3);
    push(r4);

    // Slots grow downward from fp, so the last slot has the lowest address.
    SubP(r3, fp, Operand(liftoff::GetStackSlotOffset(last_stack_slot)));
    // The exclusive end is one slot above the base of slot {index}, i.e.
    // fp - (offset(index) - kStackSlotSize).
    SubP(r4, fp, Operand(liftoff::GetStackSlotOffset(index) - kStackSlotSize));

    Label loop;
    bind(&loop);
    // r0 is the value being stored; r3 is the cursor that walks up to r4.
    StoreP(r0, MemOperand(r3));
    la(r3, MemOperand(r3, kSystemPointerSize));
    CmpLogicalP(r3, r4);
    bne(&loop);

    // Restore in reverse push order.
    pop(r4);
    pop(r3);
  }

  pop(r0);
}
#define UNIMPLEMENTED_I32_BINOP(name) \
void LiftoffAssembler::emit_##name(Register dst, Register lhs, \
Register rhs) { \