[wasm][simd][x64] Improve F32x4Min, F32x4Max

- Use a shorter code sequence for the most likely case (no NaNs or
  signed 0 errors), and use out-of-line code to handle those
  cases.
- For the likely execution paths, F32x4Min goes from 8 to 6
  instructions, while F32x4Max goes from 9 to 6 instructions.
- Code size increases by 2 and 3 instructions (the test and branch,
  and for max, an extra move.

Bug: v8:8639
Change-Id: I7966f652c89545e840ae493f25dd652b1e079b91
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2231653
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68209}
This commit is contained in:
Bill Budge 2020-06-04 17:23:40 -07:00 committed by Commit Bot
parent 40527340cd
commit f3dd0f4803

View File

@ -194,6 +194,50 @@ class OutOfLineLoadFloat64NaN final : public OutOfLineCode {
XMMRegister const result_;
};
class OutOfLineF32x4Min final : public OutOfLineCode {
public:
OutOfLineF32x4Min(CodeGenerator* gen, XMMRegister result, XMMRegister error)
: OutOfLineCode(gen), result_(result), error_(error) {}
void Generate() final {
// |result| is the partial result, |kScratchDoubleReg| is the error.
// propagate -0's and NaNs (possibly non-canonical) from the error.
__ Orps(error_, result_);
// Canonicalize NaNs by quieting and clearing the payload.
__ Cmpps(result_, error_, int8_t{3});
__ Orps(error_, result_);
__ Psrld(result_, byte{10});
__ Andnps(result_, error_);
}
private:
XMMRegister const result_;
XMMRegister const error_;
};
class OutOfLineF32x4Max final : public OutOfLineCode {
public:
OutOfLineF32x4Max(CodeGenerator* gen, XMMRegister result, XMMRegister error)
: OutOfLineCode(gen), result_(result), error_(error) {}
void Generate() final {
// |result| is the partial result, |kScratchDoubleReg| is the error.
// Propagate NaNs (possibly non-canonical).
__ Orps(result_, error_);
// Propagate sign errors and (subtle) quiet NaNs.
__ Subps(result_, error_);
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
__ Cmpps(error_, result_, int8_t{3});
__ Psrld(error_, byte{10});
__ Andnps(error_, result_);
__ Movaps(result_, error_);
}
private:
XMMRegister const result_;
XMMRegister const error_;
};
class OutOfLineTruncateDoubleToI final : public OutOfLineCode {
public:
OutOfLineTruncateDoubleToI(CodeGenerator* gen, Register result,
@ -2524,18 +2568,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister src1 = i.InputSimd128Register(1),
dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
// The minps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minps in both orders, merge the resuls, and adjust.
// The minps instruction doesn't propagate NaNs and -0's in its first
// operand. Perform minps in both orders and compare results. Handle the
// unlikely case of discrepancies out of line.
__ Movaps(kScratchDoubleReg, src1);
__ Minps(kScratchDoubleReg, dst);
__ Minps(dst, src1);
// propagate -0's and NaNs, which may be non-canonical.
__ Orps(kScratchDoubleReg, dst);
// Canonicalize NaNs by quieting and clearing the payload.
__ Cmpps(dst, kScratchDoubleReg, int8_t{3});
__ Orps(kScratchDoubleReg, dst);
__ Psrld(dst, byte{10});
__ Andnps(dst, kScratchDoubleReg);
// Most likely there is no difference and we're done.
__ Xorps(kScratchDoubleReg, dst);
__ Ptest(kScratchDoubleReg, kScratchDoubleReg);
auto ool = new (zone()) OutOfLineF32x4Min(this, dst, kScratchDoubleReg);
__ j(not_zero, ool->entry());
__ bind(ool->exit());
break;
}
case kX64F32x4Max: {
@ -2543,20 +2587,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
// The maxps instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxps in both orders, merge the resuls, and adjust.
// operand. Perform maxps in both orders and compare results. Handle the
// unlikely case of discrepancies out of line.
__ Movaps(kScratchDoubleReg, src1);
__ Maxps(kScratchDoubleReg, dst);
__ Maxps(dst, src1);
// Find discrepancies.
__ Xorps(dst, kScratchDoubleReg);
// Propagate NaNs, which may be non-canonical.
__ Orps(kScratchDoubleReg, dst);
// Propagate sign discrepancy and (subtle) quiet NaNs.
__ Subps(kScratchDoubleReg, dst);
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
__ Cmpps(dst, kScratchDoubleReg, int8_t{3});
__ Psrld(dst, byte{10});
__ Andnps(dst, kScratchDoubleReg);
// Most likely there is no difference and we're done.
__ Xorps(kScratchDoubleReg, dst);
__ Ptest(kScratchDoubleReg, kScratchDoubleReg);
auto ool = new (zone()) OutOfLineF32x4Max(this, dst, kScratchDoubleReg);
__ j(not_zero, ool->entry());
__ bind(ool->exit());
break;
}
case kX64F32x4Eq: {