[liftoff] Change FillStackSlotsWithZero to use bytes
Bug: v8:9909 Change-Id: I997ae6f19c580f08eb9ff8ee039e0dd647091616 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1947350 Commit-Queue: Zhi An Ng <zhin@chromium.org> Reviewed-by: Clemens Backes <clemensb@chromium.org> Cr-Commit-Position: refs/heads/master@{#65320}
This commit is contained in:
parent
0db45cb1d2
commit
785fa6b412
@ -1940,6 +1940,12 @@ void Assembler::emit_repmovs(int size) {
|
||||
emit(0xA5);
|
||||
}
|
||||
|
||||
void Assembler::repstosl() {
|
||||
EnsureSpace ensure_space(this);
|
||||
emit(0xF3);
|
||||
emit(0xAB);
|
||||
}
|
||||
|
||||
void Assembler::repstosq() {
|
||||
EnsureSpace ensure_space(this);
|
||||
emit(0xF3);
|
||||
|
@ -544,6 +544,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
|
||||
void repmovsl() { emit_repmovs(kInt32Size); }
|
||||
void repmovsq() { emit_repmovs(kInt64Size); }
|
||||
|
||||
// Repeated store of doublewords (fill (E)CX doublewords at ES:[(E)DI] with EAX).
|
||||
void repstosl();
|
||||
// Repeated store of quadwords (fill RCX quadwords at [RDI] with RAX).
|
||||
void repstosq();
|
||||
|
||||
|
@ -640,36 +640,29 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
|
||||
ldr(reg, liftoff::GetHalfStackSlot(offset, half));
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
DCHECK_EQ(0, size % 4);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
// We need a zero reg. Always use r0 for that, and push it before to restore
|
||||
// its value afterwards.
|
||||
push(r0);
|
||||
mov(r0, Operand(0));
|
||||
|
||||
if (count <= 5) {
|
||||
// Special straight-line code for up to five slots. Generates two
|
||||
// instructions per slot.
|
||||
for (uint32_t offset = 0; offset < count; ++offset) {
|
||||
str(r0, liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
|
||||
kLowWord));
|
||||
str(r0, liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
|
||||
kHighWord));
|
||||
if (size <= 36) {
|
||||
// Special straight-line code for up to 9 words. Generates one
|
||||
// instruction per word.
|
||||
for (uint32_t offset = 4; offset <= size; offset += 4) {
|
||||
str(r0, liftoff::GetHalfStackSlot(start + offset, kLowWord));
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts (9 instructions).
|
||||
// Use r1 for start address (inclusive), r2 for end address (exclusive).
|
||||
push(r1);
|
||||
push(r2);
|
||||
sub(r1, fp,
|
||||
Operand(liftoff::GetStackSlotOffset(
|
||||
GetStackOffsetFromIndex(last_stack_slot))));
|
||||
sub(r2, fp,
|
||||
Operand(liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) -
|
||||
kStackSlotSize));
|
||||
sub(r1, fp, Operand(liftoff::GetStackSlotOffset(start + size)));
|
||||
sub(r2, fp, Operand(liftoff::GetStackSlotOffset(start)));
|
||||
|
||||
Label loop;
|
||||
bind(&loop);
|
||||
|
@ -398,39 +398,52 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
DCHECK_EQ(0, size % 4);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
int max_stp_offset =
|
||||
-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index + count - 1));
|
||||
if (count <= 12 && IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
|
||||
int max_stp_offset = -liftoff::GetStackSlotOffset(start + size);
|
||||
if (size <= 12 * kStackSlotSize &&
|
||||
IsImmLSPair(max_stp_offset, kXRegSizeLog2)) {
|
||||
// Special straight-line code for up to 12 slots. Generates one
|
||||
// instruction per two slots (<= 6 instructions total).
|
||||
for (; count > 1; count -= 2) {
|
||||
STATIC_ASSERT(kStackSlotSize == kSystemPointerSize);
|
||||
stp(xzr, xzr,
|
||||
liftoff::GetStackSlot(GetStackOffsetFromIndex(index + count - 1)));
|
||||
// instruction per two slots (<= 7 instructions total).
|
||||
STATIC_ASSERT(kStackSlotSize == kSystemPointerSize);
|
||||
uint32_t remainder = size;
|
||||
for (; remainder >= 2 * kStackSlotSize; remainder -= 2 * kStackSlotSize) {
|
||||
stp(xzr, xzr, liftoff::GetStackSlot(start + remainder));
|
||||
}
|
||||
DCHECK(count == 0 || count == 1);
|
||||
if (count) {
|
||||
str(xzr, liftoff::GetStackSlot(GetStackOffsetFromIndex(index)));
|
||||
|
||||
DCHECK_GE(12, remainder);
|
||||
switch (remainder) {
|
||||
case 12:
|
||||
str(xzr, liftoff::GetStackSlot(start + remainder));
|
||||
strh(xzr, liftoff::GetStackSlot(start + remainder - 8));
|
||||
break;
|
||||
case 8:
|
||||
str(xzr, liftoff::GetStackSlot(start + remainder));
|
||||
break;
|
||||
case 4:
|
||||
strh(xzr, liftoff::GetStackSlot(start + remainder));
|
||||
break;
|
||||
case 0:
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts (5-8 instructions).
|
||||
UseScratchRegisterScope temps(this);
|
||||
Register address_reg = temps.AcquireX();
|
||||
// This {Sub} might use another temp register if the offset is too large.
|
||||
Sub(address_reg, fp,
|
||||
liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(last_stack_slot)));
|
||||
Sub(address_reg, fp, liftoff::GetStackSlotOffset(start + size));
|
||||
Register count_reg = temps.AcquireX();
|
||||
Mov(count_reg, count);
|
||||
Mov(count_reg, size / 4);
|
||||
|
||||
Label loop;
|
||||
bind(&loop);
|
||||
sub(count_reg, count_reg, 1);
|
||||
str(xzr, MemOperand(address_reg, kSystemPointerSize, PostIndex));
|
||||
strh(xzr, MemOperand(address_reg, kSystemPointerSize, PostIndex));
|
||||
cbnz(count_reg, &loop);
|
||||
}
|
||||
}
|
||||
|
@ -503,21 +503,16 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
|
||||
mov(reg, liftoff::GetHalfStackSlot(offset, half));
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
DCHECK_EQ(0, size % 4);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
if (count <= 2) {
|
||||
// Special straight-line code for up to two slots (6-9 bytes per word:
|
||||
// C7 <1-4 bytes operand> <4 bytes imm>, makes 12-18 bytes per slot).
|
||||
for (uint32_t offset = 0; offset < count; ++offset) {
|
||||
mov(liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
|
||||
kLowWord),
|
||||
Immediate(0));
|
||||
mov(liftoff::GetHalfStackSlot(GetStackOffsetFromIndex(index + offset),
|
||||
kHighWord),
|
||||
Immediate(0));
|
||||
if (size <= 12) {
|
||||
// Special straight-line code for up to three words (6-9 bytes per word:
|
||||
// C7 <1-4 bytes operand> <4 bytes imm>, makes 18-27 bytes total).
|
||||
for (uint32_t offset = 4; offset <= size; offset += 4) {
|
||||
mov(liftoff::GetHalfStackSlot(start + offset, kLowWord), Immediate(0));
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts.
|
||||
@ -527,10 +522,10 @@ void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
push(eax);
|
||||
push(ecx);
|
||||
push(edi);
|
||||
lea(edi, liftoff::GetStackSlot(GetStackOffsetFromIndex(last_stack_slot)));
|
||||
lea(edi, liftoff::GetStackSlot(start + size));
|
||||
xor_(eax, eax);
|
||||
// Number of words is number of slots times two.
|
||||
mov(ecx, Immediate(count * 2));
|
||||
// Size is in bytes, convert to doublewords (4-bytes).
|
||||
mov(ecx, Immediate(size / 4));
|
||||
rep_stos();
|
||||
pop(edi);
|
||||
pop(ecx);
|
||||
|
@ -286,12 +286,14 @@ class LiftoffAssembler : public TurboAssembler {
|
||||
LiftoffRegister PopToRegister(LiftoffRegList pinned = {});
|
||||
|
||||
uint32_t NextSpillOffset(ValueType type) {
|
||||
return TopSpillOffset() + SlotSizeForType(type);
|
||||
}
|
||||
|
||||
uint32_t TopSpillOffset() {
|
||||
if (cache_state_.stack_state.empty()) {
|
||||
return SlotSizeForType(type);
|
||||
return 0;
|
||||
}
|
||||
VarState last = cache_state_.stack_state.back();
|
||||
uint32_t offset = last.offset() + SlotSizeForType(type);
|
||||
return offset;
|
||||
return cache_state_.stack_state.back().offset();
|
||||
}
|
||||
|
||||
void PushRegister(ValueType type, LiftoffRegister reg) {
|
||||
|
@ -478,16 +478,18 @@ class LiftoffCompiler {
|
||||
for (uint32_t param_idx = 0; param_idx < num_params; ++param_idx) {
|
||||
input_idx += ProcessParameter(__ local_type(param_idx), input_idx);
|
||||
}
|
||||
uint32_t params_size = __ TopSpillOffset();
|
||||
DCHECK_EQ(input_idx, descriptor_->InputCount());
|
||||
|
||||
// Initialize locals beyond parameters.
|
||||
if (SpillLocalsInitially(decoder, num_params)) {
|
||||
__ FillStackSlotsWithZero(num_params, __ num_locals() - num_params);
|
||||
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
|
||||
++param_idx) {
|
||||
ValueType type = decoder->GetLocalType(param_idx);
|
||||
__ PushStack(type);
|
||||
}
|
||||
uint32_t spill_size = __ TopSpillOffset();
|
||||
__ FillStackSlotsWithZero(params_size, spill_size);
|
||||
} else {
|
||||
for (uint32_t param_idx = num_params; param_idx < __ num_locals();
|
||||
++param_idx) {
|
||||
|
@ -616,28 +616,23 @@ void LiftoffAssembler::FillI64Half(Register reg, uint32_t offset,
|
||||
lw(reg, liftoff::GetHalfStackSlot(offset, half));
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
DCHECK_EQ(0, size % 4);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
if (count <= 12) {
|
||||
// Special straight-line code for up to 12 slots. Generates one
|
||||
// instruction per slot (<=12 instructions total).
|
||||
for (uint32_t offset = 0; offset < count; ++offset) {
|
||||
Sw(zero_reg,
|
||||
liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)));
|
||||
if (size <= 48) {
|
||||
// Special straight-line code for up to 12 words. Generates one
|
||||
// instruction per word (<=12 instructions total).
|
||||
for (uint32_t offset = 4; offset <= size; offset += 4) {
|
||||
Sw(zero_reg, liftoff::GetStackSlot(start + offset));
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts (12 instructions).
|
||||
// Use a0 for start address (inclusive), a1 for end address (exclusive).
|
||||
Push(a1, a0);
|
||||
Addu(a0, fp,
|
||||
Operand(-liftoff::GetStackSlotOffset(
|
||||
GetStackOffsetFromIndex(last_stack_slot))));
|
||||
Addu(a1, fp,
|
||||
Operand(-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) +
|
||||
kStackSlotSize));
|
||||
Addu(a0, fp, Operand(-liftoff::GetStackSlotOffset(start + size)));
|
||||
Addu(a1, fp, Operand(-liftoff::GetStackSlotOffset(start)));
|
||||
|
||||
Label loop;
|
||||
bind(&loop);
|
||||
|
@ -524,28 +524,27 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(GetStackOffsetFromIndex(last_stack_slot));
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
if (count <= 12) {
|
||||
if (size <= 12 * kStackSlotSize) {
|
||||
// Special straight-line code for up to 12 slots. Generates one
|
||||
// instruction per slot (<= 12 instructions total).
|
||||
for (uint32_t offset = 0; offset < count; ++offset) {
|
||||
Sd(zero_reg,
|
||||
liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)));
|
||||
uint32_t remainder = size;
|
||||
for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
|
||||
Sd(zero_reg, liftoff::GetStackSlot(start + remainder));
|
||||
}
|
||||
DCHECK(remainder == 4 || remainder == 0);
|
||||
if (remainder) {
|
||||
Sw(zero_reg, liftoff::GetStackSlot(start + remainder));
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts (12 instructions).
|
||||
// Use a0 for start address (inclusive), a1 for end address (exclusive).
|
||||
Push(a1, a0);
|
||||
Daddu(a0, fp,
|
||||
Operand(-liftoff::GetStackSlotOffset(
|
||||
GetStackOffsetFromIndex(last_stack_slot))));
|
||||
Daddu(a1, fp,
|
||||
Operand(-liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) +
|
||||
kStackSlotSize));
|
||||
Daddu(a0, fp, Operand(-liftoff::GetStackSlotOffset(start + end)));
|
||||
Daddu(a1, fp, Operand(-liftoff::GetStackSlotOffset(start)));
|
||||
|
||||
Label loop;
|
||||
bind(&loop);
|
||||
|
@ -145,36 +145,34 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
|
||||
bailout(kUnsupportedArchitecture, "FillI64Half");
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(last_stack_slot);
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
// We need a zero reg. Always use r0 for that, and push it before to restore
|
||||
// its value afterwards.
|
||||
push(r0);
|
||||
mov(r0, Operand(0));
|
||||
|
||||
if (count <= 5) {
|
||||
if (size <= 5 * kStackSlotSize) {
|
||||
// Special straight-line code for up to five slots. Generates two
|
||||
// instructions per slot.
|
||||
for (uint32_t offset = 0; offset < count; ++offset) {
|
||||
StoreP(r0, liftoff::GetHalfStackSlot(
|
||||
GetStackOffsetFromIndex(index + offset), kLowWord));
|
||||
StoreP(r0, liftoff::GetHalfStackSlot(
|
||||
GetStackOffsetFromIndex(index + offset), kHighWord));
|
||||
uint32_t remainder = size;
|
||||
for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
|
||||
StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kLowWord));
|
||||
StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kHighWord));
|
||||
}
|
||||
DCHECK(remainder == 4 || remainder == 0);
|
||||
if (remainder) {
|
||||
StoreP(r0, liftoff::GetHalfStackSlot(start + remainder, kLowWord));
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts (9 instructions).
|
||||
// Use r3 for start address (inclusive), r4 for end address (exclusive).
|
||||
push(r3);
|
||||
push(r4);
|
||||
SubP(r3, fp,
|
||||
Operand(liftoff::GetStackSlotOffset(
|
||||
GetStackOffsetFromIndex(last_stack_slot))));
|
||||
SubP(r4, fp,
|
||||
Operand(liftoff::GetStackSlotOffset(GetStackOffsetFromIndex(index)) -
|
||||
kStackSlotSize));
|
||||
SubP(r3, fp, Operand(liftoff::GetStackSlotOffset(start + remainder)));
|
||||
SubP(r4, fp, Operand(liftoff::GetStackSlotOffset(start)));
|
||||
|
||||
Label loop;
|
||||
bind(&loop);
|
||||
|
@ -445,31 +445,34 @@ void LiftoffAssembler::FillI64Half(Register, uint32_t offset, RegPairHalf) {
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t index, uint32_t count) {
|
||||
DCHECK_LT(0, count);
|
||||
uint32_t last_stack_slot = index + count - 1;
|
||||
RecordUsedSpillOffset(
|
||||
LiftoffAssembler::GetStackOffsetFromIndex(last_stack_slot));
|
||||
void LiftoffAssembler::FillStackSlotsWithZero(uint32_t start, uint32_t size) {
|
||||
DCHECK_LT(0, size);
|
||||
RecordUsedSpillOffset(start + size);
|
||||
|
||||
if (count <= 3) {
|
||||
if (size <= 3 * kStackSlotSize) {
|
||||
// Special straight-line code for up to three slots
|
||||
// (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>).
|
||||
for (uint32_t offset = 0; offset < count; ++offset) {
|
||||
movq(liftoff::GetStackSlot(GetStackOffsetFromIndex(index + offset)),
|
||||
Immediate(0));
|
||||
// (7-10 bytes per slot: REX C7 <1-4 bytes op> <4 bytes imm>),
|
||||
// and a movl (6-9 bytes) when size % 8 != 0.
|
||||
uint32_t remainder = size;
|
||||
for (; remainder >= kStackSlotSize; remainder -= kStackSlotSize) {
|
||||
movq(liftoff::GetStackSlot(start + remainder), Immediate(0));
|
||||
}
|
||||
DCHECK(remainder == 4 || remainder == 0);
|
||||
if (remainder) {
|
||||
movl(liftoff::GetStackSlot(start + remainder), Immediate(0));
|
||||
}
|
||||
} else {
|
||||
// General case for bigger counts.
|
||||
// This sequence takes 20-23 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
|
||||
// for mov, 3 for repstosq, 3 for pops).
|
||||
// From intel manual: repstosq fills RCX quadwords at [RDI] with RAX.
|
||||
// This sequence takes 19-22 bytes (3 for pushes, 4-7 for lea, 2 for xor, 5
|
||||
// for mov, 2 for repstosl, 3 for pops).
|
||||
pushq(rax);
|
||||
pushq(rcx);
|
||||
pushq(rdi);
|
||||
leaq(rdi, liftoff::GetStackSlot(GetStackOffsetFromIndex(last_stack_slot)));
|
||||
leaq(rdi, liftoff::GetStackSlot(start + size));
|
||||
xorl(rax, rax);
|
||||
movl(rcx, Immediate(count));
|
||||
repstosq();
|
||||
// Convert size (bytes) to doublewords (4-bytes).
|
||||
movl(rcx, Immediate(size / 4));
|
||||
repstosl();
|
||||
popq(rdi);
|
||||
popq(rcx);
|
||||
popq(rax);
|
||||
|
@ -182,6 +182,7 @@ TEST(DisasmX64) {
|
||||
__ decq(rdx);
|
||||
__ cdq();
|
||||
|
||||
__ repstosl();
|
||||
__ repstosq();
|
||||
|
||||
__ nop();
|
||||
|
Loading…
Reference in New Issue
Block a user