[wasm][simd][x64] Improve F32x4Min, F32x4Max
- Use a shorter code sequence for the most likely case (no NaNs or signed 0 errors), and use out-of-line code to handle those cases. - For the likely execution paths, F32x4Min goes from 8 to 6 instructions, while F32x4Max goes from 9 to 6 instructions. - Code size increases by 2 and 3 instructions (the test and branch, and for max, an extra move. Bug: v8:8639 Change-Id: I7966f652c89545e840ae493f25dd652b1e079b91 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2231653 Reviewed-by: Zhi An Ng <zhin@chromium.org> Commit-Queue: Bill Budge <bbudge@chromium.org> Cr-Commit-Position: refs/heads/master@{#68209}
This commit is contained in:
parent
40527340cd
commit
f3dd0f4803
@ -194,6 +194,50 @@ class OutOfLineLoadFloat64NaN final : public OutOfLineCode {
|
||||
XMMRegister const result_;
|
||||
};
|
||||
|
||||
class OutOfLineF32x4Min final : public OutOfLineCode {
|
||||
public:
|
||||
OutOfLineF32x4Min(CodeGenerator* gen, XMMRegister result, XMMRegister error)
|
||||
: OutOfLineCode(gen), result_(result), error_(error) {}
|
||||
|
||||
void Generate() final {
|
||||
// |result| is the partial result, |kScratchDoubleReg| is the error.
|
||||
// propagate -0's and NaNs (possibly non-canonical) from the error.
|
||||
__ Orps(error_, result_);
|
||||
// Canonicalize NaNs by quieting and clearing the payload.
|
||||
__ Cmpps(result_, error_, int8_t{3});
|
||||
__ Orps(error_, result_);
|
||||
__ Psrld(result_, byte{10});
|
||||
__ Andnps(result_, error_);
|
||||
}
|
||||
|
||||
private:
|
||||
XMMRegister const result_;
|
||||
XMMRegister const error_;
|
||||
};
|
||||
|
||||
class OutOfLineF32x4Max final : public OutOfLineCode {
|
||||
public:
|
||||
OutOfLineF32x4Max(CodeGenerator* gen, XMMRegister result, XMMRegister error)
|
||||
: OutOfLineCode(gen), result_(result), error_(error) {}
|
||||
|
||||
void Generate() final {
|
||||
// |result| is the partial result, |kScratchDoubleReg| is the error.
|
||||
// Propagate NaNs (possibly non-canonical).
|
||||
__ Orps(result_, error_);
|
||||
// Propagate sign errors and (subtle) quiet NaNs.
|
||||
__ Subps(result_, error_);
|
||||
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
|
||||
__ Cmpps(error_, result_, int8_t{3});
|
||||
__ Psrld(error_, byte{10});
|
||||
__ Andnps(error_, result_);
|
||||
__ Movaps(result_, error_);
|
||||
}
|
||||
|
||||
private:
|
||||
XMMRegister const result_;
|
||||
XMMRegister const error_;
|
||||
};
|
||||
|
||||
class OutOfLineTruncateDoubleToI final : public OutOfLineCode {
|
||||
public:
|
||||
OutOfLineTruncateDoubleToI(CodeGenerator* gen, Register result,
|
||||
@ -2524,18 +2568,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
XMMRegister src1 = i.InputSimd128Register(1),
|
||||
dst = i.OutputSimd128Register();
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
// The minps instruction doesn't propagate NaNs and +0's in its first
|
||||
// operand. Perform minps in both orders, merge the resuls, and adjust.
|
||||
// The minps instruction doesn't propagate NaNs and -0's in its first
|
||||
// operand. Perform minps in both orders and compare results. Handle the
|
||||
// unlikely case of discrepancies out of line.
|
||||
__ Movaps(kScratchDoubleReg, src1);
|
||||
__ Minps(kScratchDoubleReg, dst);
|
||||
__ Minps(dst, src1);
|
||||
// propagate -0's and NaNs, which may be non-canonical.
|
||||
__ Orps(kScratchDoubleReg, dst);
|
||||
// Canonicalize NaNs by quieting and clearing the payload.
|
||||
__ Cmpps(dst, kScratchDoubleReg, int8_t{3});
|
||||
__ Orps(kScratchDoubleReg, dst);
|
||||
__ Psrld(dst, byte{10});
|
||||
__ Andnps(dst, kScratchDoubleReg);
|
||||
// Most likely there is no difference and we're done.
|
||||
__ Xorps(kScratchDoubleReg, dst);
|
||||
__ Ptest(kScratchDoubleReg, kScratchDoubleReg);
|
||||
auto ool = new (zone()) OutOfLineF32x4Min(this, dst, kScratchDoubleReg);
|
||||
__ j(not_zero, ool->entry());
|
||||
__ bind(ool->exit());
|
||||
break;
|
||||
}
|
||||
case kX64F32x4Max: {
|
||||
@ -2543,20 +2587,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
|
||||
dst = i.OutputSimd128Register();
|
||||
DCHECK_EQ(dst, i.InputSimd128Register(0));
|
||||
// The maxps instruction doesn't propagate NaNs and +0's in its first
|
||||
// operand. Perform maxps in both orders, merge the resuls, and adjust.
|
||||
// operand. Perform maxps in both orders and compare results. Handle the
|
||||
// unlikely case of discrepancies out of line.
|
||||
__ Movaps(kScratchDoubleReg, src1);
|
||||
__ Maxps(kScratchDoubleReg, dst);
|
||||
__ Maxps(dst, src1);
|
||||
// Find discrepancies.
|
||||
__ Xorps(dst, kScratchDoubleReg);
|
||||
// Propagate NaNs, which may be non-canonical.
|
||||
__ Orps(kScratchDoubleReg, dst);
|
||||
// Propagate sign discrepancy and (subtle) quiet NaNs.
|
||||
__ Subps(kScratchDoubleReg, dst);
|
||||
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
|
||||
__ Cmpps(dst, kScratchDoubleReg, int8_t{3});
|
||||
__ Psrld(dst, byte{10});
|
||||
__ Andnps(dst, kScratchDoubleReg);
|
||||
// Most likely there is no difference and we're done.
|
||||
__ Xorps(kScratchDoubleReg, dst);
|
||||
__ Ptest(kScratchDoubleReg, kScratchDoubleReg);
|
||||
auto ool = new (zone()) OutOfLineF32x4Max(this, dst, kScratchDoubleReg);
|
||||
__ j(not_zero, ool->entry());
|
||||
__ bind(ool->exit());
|
||||
break;
|
||||
}
|
||||
case kX64F32x4Eq: {
|
||||
|
Loading…
Reference in New Issue
Block a user