PPC: optimize load/store of reversed bytes

We can detect a load followed by a byte reversal (or a byte reversal
followed by a store) during instruction selection and, where possible,
emit a single byte-reversed load/store opcode instead of performing the
two steps separately (i.e., load/store and then reverse).

Change-Id: Ib7d0c8c7105382637c33cafac5b5f4e23e8e553d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2950243
Reviewed-by: Junliang Yan <junyan@redhat.com>
Commit-Queue: Milad Fa <mfarazma@redhat.com>
Cr-Commit-Position: refs/heads/master@{#75076}
This commit is contained in:
Milad Fa 2021-06-09 17:06:04 -04:00 committed by V8 LUCI CQ
parent c5d41ae6d2
commit 2b9cd1c963
7 changed files with 134 additions and 6 deletions

View File

@ -1240,6 +1240,10 @@ using Instr = uint32_t;
V(stfsux, STFSUX, 0x7C00056E) \
/* Store Floating-Point Single Indexed */ \
V(stfsx, STFSX, 0x7C00052E) \
/* Store Doubleword Byte-Reverse Indexed */ \
V(stdbrx, STDBRX, 0x7C000528) \
/* Store Word Byte-Reverse Indexed */ \
V(stwbrx, STWBRX, 0x7C00052C) \
/* Load Vector Indexed */ \
V(lvx, LVX, 0x7C0000CE) \
/* Store Vector Indexed */ \
@ -1286,8 +1290,6 @@ using Instr = uint32_t;
V(lwax, LWAX, 0x7C0002AA) \
/* Parity Doubleword */ \
V(prtyd, PRTYD, 0x7C000174) \
/* Store Doubleword Byte-Reverse Indexed */ \
V(stdbrx, STDBRX, 0x7C000528) \
/* Trap Doubleword */ \
V(td, TD, 0x7C000088) \
/* Branch Conditional to Branch Target Address Register */ \
@ -1314,8 +1316,6 @@ using Instr = uint32_t;
V(prtyw, PRTYW, 0x7C000134) \
/* Store Halfword Byte-Reverse Indexed */ \
V(sthbrx, STHBRX, 0x7C00072C) \
/* Store Word Byte-Reverse Indexed */ \
V(stwbrx, STWBRX, 0x7C00052C) \
/* Synchronize */ \
V(sync, SYNC, 0x7C0004AC) \
/* Trap Word */ \

View File

@ -578,6 +578,18 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen, Instruction* instr,
DCHECK_EQ(LeaveRC, i.OutputRCBit()); \
} while (0)
// Emits an integer load through `asm_instr` for instructions that must use
// register+register addressing (kMode_MRR is DCHECKed). Instruction input 2 is
// read as the is_atomic flag; when it is set, an lwsync is emitted after the
// load.
#define ASSEMBLE_LOAD_INTEGER_RR(asm_instr)          \
  do {                                               \
    Register dst = i.OutputRegister();               \
    AddressingMode addr_mode = kMode_None;           \
    MemOperand mem = i.MemoryOperand(&addr_mode);    \
    DCHECK_EQ(addr_mode, kMode_MRR);                 \
    bool needs_barrier = i.InputInt32(2);            \
    __ asm_instr(dst, mem);                          \
    if (needs_barrier) {                             \
      __ lwsync();                                   \
    }                                                \
    DCHECK_EQ(LeaveRC, i.OutputRCBit());             \
  } while (0)
#define ASSEMBLE_STORE_FLOAT(asm_instr, asm_instrx) \
do { \
size_t index = 0; \
@ -614,6 +626,20 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen, Instruction* instr,
DCHECK_EQ(LeaveRC, i.OutputRCBit()); \
} while (0)
// Emits an integer store through `asm_instr` for instructions that must use
// register+register addressing (kMode_MRR is DCHECKed). The value register is
// taken from the input index reported back by MemoryOperand(). Instruction
// input 3 is read as the is_atomic flag; when it is set, the store is
// bracketed by an lwsync before and a sync after.
#define ASSEMBLE_STORE_INTEGER_RR(asm_instr)                    \
  do {                                                          \
    size_t value_idx = 0;                                       \
    AddressingMode addr_mode = kMode_None;                      \
    MemOperand mem = i.MemoryOperand(&addr_mode, &value_idx);   \
    DCHECK_EQ(addr_mode, kMode_MRR);                            \
    Register src = i.InputRegister(value_idx);                  \
    bool needs_barrier = i.InputInt32(3);                       \
    if (needs_barrier) {                                        \
      __ lwsync();                                              \
    }                                                           \
    __ asm_instr(src, mem);                                     \
    if (needs_barrier) {                                        \
      __ sync();                                                \
    }                                                           \
    DCHECK_EQ(LeaveRC, i.OutputRCBit());                        \
  } while (0)
#if V8_TARGET_ARCH_PPC64
// TODO(mbrandy): fix paths that produce garbage in offset's upper 32-bits.
#define CleanUInt32(x) __ ClearLeftImm(x, x, Operand(32))
@ -2213,7 +2239,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ extsw(output, temp1);
break;
}
#ifdef V8_TARGET_ARCH_PPC64
case kPPC_LoadByteRev32: {
ASSEMBLE_LOAD_INTEGER_RR(lwbrx);
EmitWordLoadPoisoningIfNeeded(this, instr, i);
break;
}
case kPPC_StoreByteRev32: {
ASSEMBLE_STORE_INTEGER_RR(stwbrx);
break;
}
case kPPC_ByteRev64: {
Register input = i.InputRegister(0);
Register output = i.OutputRegister();
@ -2231,7 +2265,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ orx(output, temp2, temp3);
break;
}
#endif // V8_TARGET_ARCH_PPC64
case kPPC_LoadByteRev64: {
ASSEMBLE_LOAD_INTEGER_RR(ldbrx);
EmitWordLoadPoisoningIfNeeded(this, instr, i);
break;
}
case kPPC_StoreByteRev64: {
ASSEMBLE_STORE_INTEGER_RR(stdbrx);
break;
}
case kPPC_F64x2Splat: {
constexpr int lane_width_in_bytes = 8;
Simd128Register dst = i.OutputSimd128Register();

View File

@ -120,7 +120,9 @@ namespace compiler {
V(PPC_LoadWordU16) \
V(PPC_LoadWordS32) \
V(PPC_LoadWordU32) \
V(PPC_LoadByteRev32) \
V(PPC_LoadWord64) \
V(PPC_LoadByteRev64) \
V(PPC_LoadFloat32) \
V(PPC_LoadDouble) \
V(PPC_LoadSimd128) \
@ -128,7 +130,9 @@ namespace compiler {
V(PPC_StoreWord8) \
V(PPC_StoreWord16) \
V(PPC_StoreWord32) \
V(PPC_StoreByteRev32) \
V(PPC_StoreWord64) \
V(PPC_StoreByteRev64) \
V(PPC_StoreFloat32) \
V(PPC_StoreDouble) \
V(PPC_StoreSimd128) \

View File

@ -328,7 +328,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_LoadWordU16:
case kPPC_LoadWordS32:
case kPPC_LoadWordU32:
case kPPC_LoadByteRev32:
case kPPC_LoadWord64:
case kPPC_LoadByteRev64:
case kPPC_LoadFloat32:
case kPPC_LoadDouble:
case kPPC_LoadSimd128:
@ -361,7 +363,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kPPC_StoreWord8:
case kPPC_StoreWord16:
case kPPC_StoreWord32:
case kPPC_StoreByteRev32:
case kPPC_StoreWord64:
case kPPC_StoreByteRev64:
case kPPC_StoreFloat32:
case kPPC_StoreDouble:
case kPPC_StoreSimd128:

View File

@ -316,6 +316,7 @@ void InstructionSelector::VisitStore(Node* node) {
} else {
ArchOpcode opcode;
ImmediateMode mode = kInt16Imm;
NodeMatcher m(value);
switch (rep) {
case MachineRepresentation::kFloat32:
opcode = kPPC_StoreFloat32;
@ -332,6 +333,11 @@ void InstructionSelector::VisitStore(Node* node) {
break;
case MachineRepresentation::kWord32:
opcode = kPPC_StoreWord32;
if (m.IsWord32ReverseBytes()) {
opcode = kPPC_StoreByteRev32;
value = value->InputAt(0);
mode = kNoImmediate;
}
break;
case MachineRepresentation::kCompressedPointer: // Fall through.
case MachineRepresentation::kCompressed:
@ -351,6 +357,11 @@ void InstructionSelector::VisitStore(Node* node) {
case MachineRepresentation::kWord64:
opcode = kPPC_StoreWord64;
mode = kInt16Imm_4ByteAligned;
if (m.IsWord64ReverseBytes()) {
opcode = kPPC_StoreByteRev64;
value = value->InputAt(0);
mode = kNoImmediate;
}
break;
case MachineRepresentation::kSimd128:
opcode = kPPC_StoreSimd128;
@ -975,12 +986,37 @@ void InstructionSelector::VisitWord64ReverseBits(Node* node) { UNREACHABLE(); }
// Selects code for a 64-bit byte reversal.
//
// If the operand is a 64-bit Load that this node can cover, the load and the
// reversal are folded into a single kPPC_LoadByteRev64 using register+register
// addressing (kMode_MRR). Otherwise a standalone kPPC_ByteRev64 is emitted,
// which needs one scratch register.
void InstructionSelector::VisitWord64ReverseBytes(Node* node) {
  PPCOperandGenerator g(this);
  NodeMatcher input(node->InputAt(0));
  if (CanCover(node, input.node()) && input.IsLoad()) {
    LoadRepresentation load_rep = LoadRepresentationOf(input.node()->op());
    if (load_rep.representation() == MachineRepresentation::kWord64) {
      Node* base = input.node()->InputAt(0);
      Node* offset = input.node()->InputAt(1);
      // The code generator expands this opcode with ASSEMBLE_LOAD_INTEGER_RR,
      // which unconditionally reads input 2 as the is_atomic flag, so the
      // immediate has to be supplied even though no barrier is wanted here.
      // NOTE(review): assumes IsLoad() never matches an atomic load - confirm.
      const bool is_atomic = false;
      Emit(kPPC_LoadByteRev64 | AddressingModeField::encode(kMode_MRR),
           g.DefineAsRegister(node), g.UseRegister(base),
           g.UseRegister(offset), g.UseImmediate(is_atomic));
      return;
    }
  }
  // Only the non-folded sequence uses a temp, so allocate it on this path.
  InstructionOperand temp[] = {g.TempRegister()};
  Emit(kPPC_ByteRev64, g.DefineAsRegister(node),
       g.UseUniqueRegister(node->InputAt(0)), 1, temp);
}
// Selects code for a 32-bit byte reversal.
//
// If the operand is a 32-bit Load that this node can cover, the load and the
// reversal are folded into a single kPPC_LoadByteRev32 using register+register
// addressing (kMode_MRR). Otherwise a standalone kPPC_ByteRev32 is emitted.
void InstructionSelector::VisitWord32ReverseBytes(Node* node) {
  PPCOperandGenerator g(this);
  NodeMatcher input(node->InputAt(0));
  if (CanCover(node, input.node()) && input.IsLoad()) {
    LoadRepresentation load_rep = LoadRepresentationOf(input.node()->op());
    if (load_rep.representation() == MachineRepresentation::kWord32) {
      Node* base = input.node()->InputAt(0);
      Node* offset = input.node()->InputAt(1);
      // The code generator expands this opcode with ASSEMBLE_LOAD_INTEGER_RR,
      // which unconditionally reads input 2 as the is_atomic flag, so the
      // immediate has to be supplied even though no barrier is wanted here.
      // NOTE(review): assumes IsLoad() never matches an atomic load - confirm.
      const bool is_atomic = false;
      Emit(kPPC_LoadByteRev32 | AddressingModeField::encode(kMode_MRR),
           g.DefineAsRegister(node), g.UseRegister(base),
           g.UseRegister(offset), g.UseImmediate(is_atomic));
      return;
    }
  }
  Emit(kPPC_ByteRev32, g.DefineAsRegister(node),
       g.UseRegister(node->InputAt(0)));
}

View File

@ -1105,6 +1105,18 @@ void Decoder::DecodeExt2(Instruction* instr) {
Format(instr, "ldbrx 'rt, 'ra, 'rb");
return;
}
case LWBRX: {
Format(instr, "lwbrx 'rt, 'ra, 'rb");
return;
}
case STDBRX: {
Format(instr, "stdbrx 'rs, 'ra, 'rb");
return;
}
case STWBRX: {
Format(instr, "stwbrx 'rs, 'ra, 'rb");
return;
}
case MTCRF: {
Format(instr, "mtcrf 'FXM, 'rs");
return;

View File

@ -2951,6 +2951,36 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
set_register(rt, result);
break;
}
case LWBRX: {
int rt = instr->RTValue();
int ra = instr->RAValue();
int rb = instr->RBValue();
intptr_t ra_val = ra == 0 ? 0 : get_register(ra);
intptr_t rb_val = get_register(rb);
intptr_t result = __builtin_bswap32(ReadW(ra_val + rb_val));
set_register(rt, result);
break;
}
case STDBRX: {
int rs = instr->RSValue();
int ra = instr->RAValue();
int rb = instr->RBValue();
intptr_t ra_val = ra == 0 ? 0 : get_register(ra);
intptr_t rs_val = get_register(rs);
intptr_t rb_val = get_register(rb);
WriteDW(ra_val + rb_val, __builtin_bswap64(rs_val));
break;
}
case STWBRX: {
int rs = instr->RSValue();
int ra = instr->RAValue();
int rb = instr->RBValue();
intptr_t ra_val = ra == 0 ? 0 : get_register(ra);
intptr_t rs_val = get_register(rs);
intptr_t rb_val = get_register(rb);
WriteW(ra_val + rb_val, __builtin_bswap32(rs_val));
break;
}
case STDX:
case STDUX: {
int rs = instr->RSValue();