PPC: optimize load/store of reversed bytes
We can detect the sequence during instruction selection and, where possible, emit a single byte-reversed load/store opcode instead of performing the two steps separately (i.e., a load/store followed by a byte reversal). Change-Id: Ib7d0c8c7105382637c33cafac5b5f4e23e8e553d Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2950243 Reviewed-by: Junliang Yan <junyan@redhat.com> Commit-Queue: Milad Fa <mfarazma@redhat.com> Cr-Commit-Position: refs/heads/master@{#75076}
This commit is contained in:
parent
c5d41ae6d2
commit
2b9cd1c963
@ -1240,6 +1240,10 @@ using Instr = uint32_t;
|
||||
V(stfsux, STFSUX, 0x7C00056E) \
|
||||
/* Store Floating-Point Single Indexed */ \
|
||||
V(stfsx, STFSX, 0x7C00052E) \
|
||||
/* Store Doubleword Byte-Reverse Indexed */ \
|
||||
V(stdbrx, STDBRX, 0x7C000528) \
|
||||
/* Store Word Byte-Reverse Indexed */ \
|
||||
V(stwbrx, STWBRX, 0x7C00052C) \
|
||||
/* Load Vector Indexed */ \
|
||||
V(lvx, LVX, 0x7C0000CE) \
|
||||
/* Store Vector Indexed */ \
|
||||
@ -1286,8 +1290,6 @@ using Instr = uint32_t;
|
||||
V(lwax, LWAX, 0x7C0002AA) \
|
||||
/* Parity Doubleword */ \
|
||||
V(prtyd, PRTYD, 0x7C000174) \
|
||||
/* Store Doubleword Byte-Reverse Indexed */ \
|
||||
V(stdbrx, STDBRX, 0x7C000528) \
|
||||
/* Trap Doubleword */ \
|
||||
V(td, TD, 0x7C000088) \
|
||||
/* Branch Conditional to Branch Target Address Register */ \
|
||||
@ -1314,8 +1316,6 @@ using Instr = uint32_t;
|
||||
V(prtyw, PRTYW, 0x7C000134) \
|
||||
/* Store Halfword Byte-Reverse Indexed */ \
|
||||
V(sthbrx, STHBRX, 0x7C00072C) \
|
||||
/* Store Word Byte-Reverse Indexed */ \
|
||||
V(stwbrx, STWBRX, 0x7C00052C) \
|
||||
/* Synchronize */ \
|
||||
V(sync, SYNC, 0x7C0004AC) \
|
||||
/* Trap Word */ \
|
||||
|
@ -578,6 +578,18 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen, Instruction* instr,
|
||||
DCHECK_EQ(LeaveRC, i.OutputRCBit()); \
|
||||
} while (0)
|
||||
|
||||
// Emits an integer load through |asm_instr| using a memory operand that must
// be in register+register (kMode_MRR) addressing mode.
//
// Contract visible from the body:
//   - the loaded value is written to the instruction's output register;
//   - instruction input 2 is read as an integer flag; when it is non-zero an
//     lwsync is emitted right after the load;
//   - the instruction must not request a record-bit update (LeaveRC).
#define ASSEMBLE_LOAD_INTEGER_RR(asm_instr)                  \
  do {                                                       \
    AddressingMode addr_mode = kMode_None;                   \
    const MemOperand mem = i.MemoryOperand(&addr_mode);      \
    DCHECK_EQ(addr_mode, kMode_MRR);                         \
    const bool needs_barrier = i.InputInt32(2);              \
    const Register dst = i.OutputRegister();                 \
    __ asm_instr(dst, mem);                                  \
    if (needs_barrier) {                                     \
      __ lwsync();                                           \
    }                                                        \
    DCHECK_EQ(LeaveRC, i.OutputRCBit());                     \
  } while (0)
|
||||
|
||||
#define ASSEMBLE_STORE_FLOAT(asm_instr, asm_instrx) \
|
||||
do { \
|
||||
size_t index = 0; \
|
||||
@ -614,6 +626,20 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen, Instruction* instr,
|
||||
DCHECK_EQ(LeaveRC, i.OutputRCBit()); \
|
||||
} while (0)
|
||||
|
||||
// Emits an integer store through |asm_instr| using a memory operand that must
// be in register+register (kMode_MRR) addressing mode.
//
// Contract visible from the body:
//   - the input index handed to MemoryOperand() is afterwards used to fetch
//     the register holding the value to store;
//   - instruction input 3 is read as an integer flag; when it is non-zero the
//     store is bracketed by an lwsync before it and a sync after it;
//   - the instruction must not request a record-bit update (LeaveRC).
#define ASSEMBLE_STORE_INTEGER_RR(asm_instr)                              \
  do {                                                                    \
    size_t value_index = 0;                                               \
    AddressingMode addr_mode = kMode_None;                                \
    const MemOperand mem = i.MemoryOperand(&addr_mode, &value_index);     \
    DCHECK_EQ(addr_mode, kMode_MRR);                                      \
    const Register src = i.InputRegister(value_index);                    \
    const bool needs_barrier = i.InputInt32(3);                           \
    if (needs_barrier) {                                                  \
      __ lwsync();                                                        \
    }                                                                     \
    __ asm_instr(src, mem);                                               \
    if (needs_barrier) {                                                  \
      __ sync();                                                          \
    }                                                                     \
    DCHECK_EQ(LeaveRC, i.OutputRCBit());                                  \
  } while (0)
|
||||
|
||||
#if V8_TARGET_ARCH_PPC64
|
||||
// TODO(mbrandy): fix paths that produce garbage in offset's upper 32-bits.
|
||||
#define CleanUInt32(x) __ ClearLeftImm(x, x, Operand(32))
|
||||
@ -2213,7 +2239,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
__ extsw(output, temp1);
|
||||
break;
|
||||
}
|
||||
#ifdef V8_TARGET_ARCH_PPC64
|
||||
case kPPC_LoadByteRev32: {
|
||||
ASSEMBLE_LOAD_INTEGER_RR(lwbrx);
|
||||
EmitWordLoadPoisoningIfNeeded(this, instr, i);
|
||||
break;
|
||||
}
|
||||
case kPPC_StoreByteRev32: {
|
||||
ASSEMBLE_STORE_INTEGER_RR(stwbrx);
|
||||
break;
|
||||
}
|
||||
case kPPC_ByteRev64: {
|
||||
Register input = i.InputRegister(0);
|
||||
Register output = i.OutputRegister();
|
||||
@ -2231,7 +2265,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
__ orx(output, temp2, temp3);
|
||||
break;
|
||||
}
|
||||
#endif // V8_TARGET_ARCH_PPC64
|
||||
case kPPC_LoadByteRev64: {
|
||||
ASSEMBLE_LOAD_INTEGER_RR(ldbrx);
|
||||
EmitWordLoadPoisoningIfNeeded(this, instr, i);
|
||||
break;
|
||||
}
|
||||
case kPPC_StoreByteRev64: {
|
||||
ASSEMBLE_STORE_INTEGER_RR(stdbrx);
|
||||
break;
|
||||
}
|
||||
case kPPC_F64x2Splat: {
|
||||
constexpr int lane_width_in_bytes = 8;
|
||||
Simd128Register dst = i.OutputSimd128Register();
|
||||
|
@ -120,7 +120,9 @@ namespace compiler {
|
||||
V(PPC_LoadWordU16) \
|
||||
V(PPC_LoadWordS32) \
|
||||
V(PPC_LoadWordU32) \
|
||||
V(PPC_LoadByteRev32) \
|
||||
V(PPC_LoadWord64) \
|
||||
V(PPC_LoadByteRev64) \
|
||||
V(PPC_LoadFloat32) \
|
||||
V(PPC_LoadDouble) \
|
||||
V(PPC_LoadSimd128) \
|
||||
@ -128,7 +130,9 @@ namespace compiler {
|
||||
V(PPC_StoreWord8) \
|
||||
V(PPC_StoreWord16) \
|
||||
V(PPC_StoreWord32) \
|
||||
V(PPC_StoreByteRev32) \
|
||||
V(PPC_StoreWord64) \
|
||||
V(PPC_StoreByteRev64) \
|
||||
V(PPC_StoreFloat32) \
|
||||
V(PPC_StoreDouble) \
|
||||
V(PPC_StoreSimd128) \
|
||||
|
@ -328,7 +328,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kPPC_LoadWordU16:
|
||||
case kPPC_LoadWordS32:
|
||||
case kPPC_LoadWordU32:
|
||||
case kPPC_LoadByteRev32:
|
||||
case kPPC_LoadWord64:
|
||||
case kPPC_LoadByteRev64:
|
||||
case kPPC_LoadFloat32:
|
||||
case kPPC_LoadDouble:
|
||||
case kPPC_LoadSimd128:
|
||||
@ -361,7 +363,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
|
||||
case kPPC_StoreWord8:
|
||||
case kPPC_StoreWord16:
|
||||
case kPPC_StoreWord32:
|
||||
case kPPC_StoreByteRev32:
|
||||
case kPPC_StoreWord64:
|
||||
case kPPC_StoreByteRev64:
|
||||
case kPPC_StoreFloat32:
|
||||
case kPPC_StoreDouble:
|
||||
case kPPC_StoreSimd128:
|
||||
|
@ -316,6 +316,7 @@ void InstructionSelector::VisitStore(Node* node) {
|
||||
} else {
|
||||
ArchOpcode opcode;
|
||||
ImmediateMode mode = kInt16Imm;
|
||||
NodeMatcher m(value);
|
||||
switch (rep) {
|
||||
case MachineRepresentation::kFloat32:
|
||||
opcode = kPPC_StoreFloat32;
|
||||
@ -332,6 +333,11 @@ void InstructionSelector::VisitStore(Node* node) {
|
||||
break;
|
||||
case MachineRepresentation::kWord32:
|
||||
opcode = kPPC_StoreWord32;
|
||||
if (m.IsWord32ReverseBytes()) {
|
||||
opcode = kPPC_StoreByteRev32;
|
||||
value = value->InputAt(0);
|
||||
mode = kNoImmediate;
|
||||
}
|
||||
break;
|
||||
case MachineRepresentation::kCompressedPointer: // Fall through.
|
||||
case MachineRepresentation::kCompressed:
|
||||
@ -351,6 +357,11 @@ void InstructionSelector::VisitStore(Node* node) {
|
||||
case MachineRepresentation::kWord64:
|
||||
opcode = kPPC_StoreWord64;
|
||||
mode = kInt16Imm_4ByteAligned;
|
||||
if (m.IsWord64ReverseBytes()) {
|
||||
opcode = kPPC_StoreByteRev64;
|
||||
value = value->InputAt(0);
|
||||
mode = kNoImmediate;
|
||||
}
|
||||
break;
|
||||
case MachineRepresentation::kSimd128:
|
||||
opcode = kPPC_StoreSimd128;
|
||||
@ -975,12 +986,37 @@ void InstructionSelector::VisitWord64ReverseBits(Node* node) { UNREACHABLE(); }
|
||||
// Selects code for a 64-bit byte reversal.
//
// Fast path: when the operand is a 64-bit load that this node can cover, the
// load and the reversal are fused into a single kPPC_LoadByteRev64 in
// register+register (kMode_MRR) addressing mode.
// Fallback: a generic kPPC_ByteRev64 on the operand, which needs one scratch
// register.
void InstructionSelector::VisitWord64ReverseBytes(Node* node) {
  PPCOperandGenerator g(this);
  NodeMatcher input(node->InputAt(0));
  if (CanCover(node, input.node()) && input.IsLoad()) {
    LoadRepresentation load_rep = LoadRepresentationOf(input.node()->op());
    if (load_rep.representation() == MachineRepresentation::kWord64) {
      Node* load_op = input.node();
      Node* base = load_op->InputAt(0);
      Node* offset = load_op->InputAt(1);
      // The code generator handles this opcode with ASSEMBLE_LOAD_INTEGER_RR,
      // which reads input 2 as the "is atomic" flag, so the flag has to be
      // supplied as a third input; omitting it makes that read out of range.
      bool is_atomic = (load_op->opcode() == IrOpcode::kWord32AtomicLoad ||
                        load_op->opcode() == IrOpcode::kWord64AtomicLoad);
      Emit(kPPC_LoadByteRev64 | AddressingModeField::encode(kMode_MRR),
           g.DefineAsRegister(node), g.UseRegister(base),
           g.UseRegister(offset), g.UseImmediate(is_atomic));
      return;
    }
  }
  // Only the generic reversal needs a scratch register, so allocate it here
  // rather than unconditionally.
  InstructionOperand temp[] = {g.TempRegister()};
  Emit(kPPC_ByteRev64, g.DefineAsRegister(node),
       g.UseUniqueRegister(node->InputAt(0)), 1, temp);
}
|
||||
|
||||
// Selects code for a 32-bit byte reversal.
//
// Fast path: when the operand is a 32-bit load that this node can cover, the
// load and the reversal are fused into a single kPPC_LoadByteRev32 in
// register+register (kMode_MRR) addressing mode.
// Fallback: a generic kPPC_ByteRev32 on the operand.
void InstructionSelector::VisitWord32ReverseBytes(Node* node) {
  PPCOperandGenerator g(this);
  NodeMatcher input(node->InputAt(0));
  if (CanCover(node, input.node()) && input.IsLoad()) {
    LoadRepresentation load_rep = LoadRepresentationOf(input.node()->op());
    if (load_rep.representation() == MachineRepresentation::kWord32) {
      Node* load_op = input.node();
      Node* base = load_op->InputAt(0);
      Node* offset = load_op->InputAt(1);
      // The code generator handles this opcode with ASSEMBLE_LOAD_INTEGER_RR,
      // which reads input 2 as the "is atomic" flag, so the flag has to be
      // supplied as a third input; omitting it makes that read out of range.
      bool is_atomic = (load_op->opcode() == IrOpcode::kWord32AtomicLoad ||
                        load_op->opcode() == IrOpcode::kWord64AtomicLoad);
      Emit(kPPC_LoadByteRev32 | AddressingModeField::encode(kMode_MRR),
           g.DefineAsRegister(node), g.UseRegister(base),
           g.UseRegister(offset), g.UseImmediate(is_atomic));
      return;
    }
  }
  Emit(kPPC_ByteRev32, g.DefineAsRegister(node),
       g.UseRegister(node->InputAt(0)));
}
|
||||
|
@ -1105,6 +1105,18 @@ void Decoder::DecodeExt2(Instruction* instr) {
|
||||
Format(instr, "ldbrx 'rt, 'ra, 'rb");
|
||||
return;
|
||||
}
|
||||
case LWBRX: {
|
||||
Format(instr, "lwbrx 'rt, 'ra, 'rb");
|
||||
return;
|
||||
}
|
||||
case STDBRX: {
|
||||
Format(instr, "stdbrx 'rs, 'ra, 'rb");
|
||||
return;
|
||||
}
|
||||
case STWBRX: {
|
||||
Format(instr, "stwbrx 'rs, 'ra, 'rb");
|
||||
return;
|
||||
}
|
||||
case MTCRF: {
|
||||
Format(instr, "mtcrf 'FXM, 'rs");
|
||||
return;
|
||||
|
@ -2951,6 +2951,36 @@ void Simulator::ExecuteGeneric(Instruction* instr) {
|
||||
set_register(rt, result);
|
||||
break;
|
||||
}
|
||||
case LWBRX: {
|
||||
int rt = instr->RTValue();
|
||||
int ra = instr->RAValue();
|
||||
int rb = instr->RBValue();
|
||||
intptr_t ra_val = ra == 0 ? 0 : get_register(ra);
|
||||
intptr_t rb_val = get_register(rb);
|
||||
intptr_t result = __builtin_bswap32(ReadW(ra_val + rb_val));
|
||||
set_register(rt, result);
|
||||
break;
|
||||
}
|
||||
case STDBRX: {
|
||||
int rs = instr->RSValue();
|
||||
int ra = instr->RAValue();
|
||||
int rb = instr->RBValue();
|
||||
intptr_t ra_val = ra == 0 ? 0 : get_register(ra);
|
||||
intptr_t rs_val = get_register(rs);
|
||||
intptr_t rb_val = get_register(rb);
|
||||
WriteDW(ra_val + rb_val, __builtin_bswap64(rs_val));
|
||||
break;
|
||||
}
|
||||
case STWBRX: {
|
||||
int rs = instr->RSValue();
|
||||
int ra = instr->RAValue();
|
||||
int rb = instr->RBValue();
|
||||
intptr_t ra_val = ra == 0 ? 0 : get_register(ra);
|
||||
intptr_t rs_val = get_register(rs);
|
||||
intptr_t rb_val = get_register(rb);
|
||||
WriteW(ra_val + rb_val, __builtin_bswap32(rs_val));
|
||||
break;
|
||||
}
|
||||
case STDX:
|
||||
case STDUX: {
|
||||
int rs = instr->RSValue();
|
||||
|
Loading…
Reference in New Issue
Block a user