PPC [liftoff]: optimize unsigned byte reverse ops

This CL optimizes the ByteReverse 16/32/64 ops on pre-Power10 CPUs as well
as on PPC_10_PLUS.

A 32-bit sign extension is also added to `ByteRev32` in codegen.

Change-Id: I6379ac4222f3574ab226971546238142039fe977
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3298308
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Reviewed-by: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/main@{#78048}
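
For background, here is a minimal sketch in portable C++ of what the three unsigned byte-reverse helpers compute, and why the 16- and 32-bit variants are zero-extended afterwards. The function names are illustrative only and are not part of V8 or of this CL.

#include <cstdint>

// Illustrative sketch only; these names are not V8 APIs.
inline uint64_t ByteRev16Sketch(uint64_t v) {
  // Swap the two low bytes; the result is zero-extended, mirroring
  // the ZeroExtHalfWord call in the assembler version.
  uint16_t x = static_cast<uint16_t>(v);
  return static_cast<uint16_t>((x << 8) | (x >> 8));
}

inline uint64_t ByteRev32Sketch(uint64_t v) {
  // Reverse the four low bytes; zero-extended, cf. ZeroExtWord32.
  uint32_t x = static_cast<uint32_t>(v);
  return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
         ((x << 8) & 0x00FF0000u) | (x << 24);
}

inline uint64_t ByteRev64Sketch(uint64_t v) {
  // Reverse all eight bytes.
  uint64_t r = 0;
  for (int i = 0; i < 8; ++i) r = (r << 8) | ((v >> (8 * i)) & 0xFF);
  return r;
}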


@@ -3579,21 +3579,37 @@ void TurboAssembler::SwapSimd128(MemOperand src, MemOperand dst,
addi(sp, sp, Operand(2 * kSimd128Size));
}
void TurboAssembler::ByteReverseU16(Register dst, Register val) {
subi(sp, sp, Operand(kSystemPointerSize));
sth(val, MemOperand(sp));
lhbrx(dst, MemOperand(r0, sp));
addi(sp, sp, Operand(kSystemPointerSize));
void TurboAssembler::ByteReverseU16(Register dst, Register val,
Register scratch) {
if (CpuFeatures::IsSupported(PPC_10_PLUS)) {
brh(dst, val);
ZeroExtHalfWord(dst, dst);
return;
}
rlwinm(scratch, val, 8, 16, 23);
rlwinm(dst, val, 24, 24, 31);
orx(dst, scratch, dst);
ZeroExtHalfWord(dst, dst);
}
void TurboAssembler::ByteReverseU32(Register dst, Register val) {
subi(sp, sp, Operand(kSystemPointerSize));
stw(val, MemOperand(sp));
lwbrx(dst, MemOperand(r0, sp));
addi(sp, sp, Operand(kSystemPointerSize));
void TurboAssembler::ByteReverseU32(Register dst, Register val,
Register scratch) {
if (CpuFeatures::IsSupported(PPC_10_PLUS)) {
brw(dst, val);
ZeroExtWord32(dst, dst);
return;
}
rotlwi(scratch, val, 8);
rlwimi(scratch, val, 24, 0, 7);
rlwimi(scratch, val, 24, 16, 23);
ZeroExtWord32(dst, scratch);
}
void TurboAssembler::ByteReverseU64(Register dst, Register val) {
if (CpuFeatures::IsSupported(PPC_10_PLUS)) {
brd(dst, val);
return;
}
subi(sp, sp, Operand(kSystemPointerSize));
std(val, MemOperand(sp));
ldbrx(dst, MemOperand(r0, sp));
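
To make the new non-Power10 16-bit path above easier to follow, here is a hedged equivalence in portable C++, assuming IBM bit numbering for the rlwinm masks (bits 16..23 correspond to 0x0000FF00 and bits 24..31 to 0x000000FF); the helper names are illustrative only, not V8 code. The 32-bit fallback follows the same idea, using rotlwi plus two rlwimi rotate-and-insert steps to assemble the reversed word in the scratch register.

#include <cstdint>

// Illustrative sketch of the rlwinm/orx sequence above; not V8 code.
inline uint32_t RotateLeft32(uint32_t x, unsigned n) {
  return (x << n) | (x >> (32 - n));  // n is 8 or 24 below, never 0
}

inline uint32_t ByteReverseU16FallbackSketch(uint32_t val) {
  uint32_t scratch = RotateLeft32(val, 8) & 0x0000FF00u;  // rlwinm scratch, val, 8, 16, 23
  uint32_t dst = RotateLeft32(val, 24) & 0x000000FFu;     // rlwinm dst, val, 24, 24, 31
  dst = scratch | dst;                                    // orx dst, scratch, dst
  return dst & 0x0000FFFFu;                               // ZeroExtHalfWord(dst, dst)
}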
@@ -3826,7 +3842,7 @@ void TurboAssembler::ReverseBitsU64(Register dst, Register src,
void TurboAssembler::ReverseBitsU32(Register dst, Register src,
Register scratch1, Register scratch2) {
ByteReverseU32(dst, src);
ByteReverseU32(dst, src, scratch1);
for (int i = 4; i < 8; i++) {
ReverseBitsInSingleByteU64(dst, dst, scratch1, scratch2, i);
}


@@ -612,8 +612,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Simd128Register scratch);
void SwapSimd128(MemOperand src, MemOperand dst, Simd128Register scratch);
void ByteReverseU16(Register dst, Register val);
void ByteReverseU32(Register dst, Register val);
void ByteReverseU16(Register dst, Register val, Register scratch);
void ByteReverseU32(Register dst, Register val, Register scratch);
void ByteReverseU64(Register dst, Register val);
// Before calling a C-function from generated code, align arguments on stack.


@@ -2075,6 +2075,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
Register temp1 = r0;
if (CpuFeatures::IsSupported(PPC_10_PLUS)) {
__ brw(output, input);
__ extsw(output, output);
break;
}
__ rotlwi(temp1, input, 8);
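
The extra extsw is the 32-bit sign extension mentioned in the commit message: after reversing, the top bit of the 32-bit result can differ from the original, so the 64-bit register is re-sign-extended to hold the sign-extended form of the 32-bit result. A rough illustration in C++ follows (illustrative only, not V8 code); the values are made up for demonstration.

#include <cstdint>
#include <cstdio>

// Shows how a zero-extended vs. sign-extended byte-reversed word
// differ when held in a 64-bit register.
int main() {
  uint32_t reversed = 0x80706050u;  // a 32-bit byte-reverse result whose top bit is set
  uint64_t zero_extended = reversed;                       // upper 32 bits are 0
  int64_t sign_extended = static_cast<int32_t>(reversed);  // what extsw produces
  std::printf("%016llx\n%016llx\n",
              static_cast<unsigned long long>(zero_extended),
              static_cast<unsigned long long>(sign_extended));
  return 0;
}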


@@ -559,9 +559,9 @@ constexpr bool is_be = false;
case StoreType::kI64Store16: { \
auto op_func = [&](Register dst, Register lhs, Register rhs) { \
if (is_be) { \
ByteReverseU16(dst, lhs); \
ByteReverseU16(dst, lhs, r0); \
instr(dst, dst, rhs); \
ByteReverseU16(dst, dst); \
ByteReverseU16(dst, dst, r0); \
} else { \
instr(dst, lhs, rhs); \
} \
@@ -573,9 +573,9 @@ constexpr bool is_be = false;
case StoreType::kI64Store32: { \
auto op_func = [&](Register dst, Register lhs, Register rhs) { \
if (is_be) { \
ByteReverseU32(dst, lhs); \
ByteReverseU32(dst, lhs, r0); \
instr(dst, dst, rhs); \
ByteReverseU32(dst, dst); \
ByteReverseU32(dst, dst, r0); \
} else { \
instr(dst, lhs, rhs); \
} \
@@ -657,9 +657,9 @@ void LiftoffAssembler::AtomicExchange(Register dst_addr, Register offset_reg,
case StoreType::kI32Store16:
case StoreType::kI64Store16: {
if (is_be) {
ByteReverseU16(r0, value.gp());
ByteReverseU16(r0, value.gp(), ip);
TurboAssembler::AtomicExchange<uint16_t>(dst, r0, result.gp());
ByteReverseU16(result.gp(), result.gp());
ByteReverseU16(result.gp(), result.gp(), ip);
} else {
TurboAssembler::AtomicExchange<uint16_t>(dst, value.gp(), result.gp());
}
@@ -668,9 +668,9 @@ void LiftoffAssembler::AtomicExchange(Register dst_addr, Register offset_reg,
case StoreType::kI32Store:
case StoreType::kI64Store32: {
if (is_be) {
ByteReverseU32(r0, value.gp());
ByteReverseU32(r0, value.gp(), ip);
TurboAssembler::AtomicExchange<uint32_t>(dst, r0, result.gp());
ByteReverseU32(result.gp(), result.gp());
ByteReverseU32(result.gp(), result.gp(), ip);
} else {
TurboAssembler::AtomicExchange<uint32_t>(dst, value.gp(), result.gp());
}
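
These Liftoff call sites only thread an explicit scratch register (r0 or ip) through the existing big-endian paths; the pattern itself is unchanged: byte-reverse the operand, run the atomic primitive, byte-reverse the result back. A rough sketch of that pattern in plain C++, using the GCC/Clang __builtin_bswap32 builtin in place of ByteReverseU32 (illustrative only, not V8 code):

#include <atomic>
#include <cstdint>

// Mirrors the big-endian (is_be) pattern above: reverse the operand,
// do the atomic access, reverse the result.
uint32_t AtomicExchangeLittleEndianValue(std::atomic<uint32_t>* addr,
                                         uint32_t value) {
  uint32_t reversed = __builtin_bswap32(value);  // cf. ByteReverseU32(r0, value.gp(), ip)
  uint32_t old = addr->exchange(reversed);       // cf. AtomicExchange<uint32_t>(...)
  return __builtin_bswap32(old);                 // cf. ByteReverseU32(result.gp(), result.gp(), ip)
}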
@@ -719,11 +719,11 @@ void LiftoffAssembler::AtomicCompareExchange(
case StoreType::kI64Store16: {
if (is_be) {
Push(new_value.gp(), expected.gp());
ByteReverseU16(new_value.gp(), new_value.gp());
ByteReverseU16(expected.gp(), expected.gp());
ByteReverseU16(new_value.gp(), new_value.gp(), r0);
ByteReverseU16(expected.gp(), expected.gp(), r0);
TurboAssembler::AtomicCompareExchange<uint16_t>(
dst, expected.gp(), new_value.gp(), result.gp(), r0);
ByteReverseU16(result.gp(), result.gp());
ByteReverseU16(result.gp(), result.gp(), r0);
Pop(new_value.gp(), expected.gp());
} else {
TurboAssembler::AtomicCompareExchange<uint16_t>(
@@ -735,11 +735,11 @@ void LiftoffAssembler::AtomicCompareExchange(
case StoreType::kI64Store32: {
if (is_be) {
Push(new_value.gp(), expected.gp());
ByteReverseU32(new_value.gp(), new_value.gp());
ByteReverseU32(expected.gp(), expected.gp());
ByteReverseU32(new_value.gp(), new_value.gp(), r0);
ByteReverseU32(expected.gp(), expected.gp(), r0);
TurboAssembler::AtomicCompareExchange<uint32_t>(
dst, expected.gp(), new_value.gp(), result.gp(), r0);
ByteReverseU32(result.gp(), result.gp());
ByteReverseU32(result.gp(), result.gp(), r0);
Pop(new_value.gp(), expected.gp());
} else {
TurboAssembler::AtomicCompareExchange<uint32_t>(