[wasm-simd][ia32][liftoff] Implement double precision conversions
Extract codegen into macro-assembler functions for reuse in Liftoff.
Some minor tweaks in I32x4TruncSatF64x2SZero and I32x4TruncSatF64x2UZero
to check dst and src overlap and move to scratch/dst accordingly. In
TurboFan we can set these restrictions in the instruction-selector, but
not in Liftoff. This doesn't make TurboFan codegen any worse, since
those restrictions are still in place.

Bug: v8:11265
Change-Id: I48f354c5ff86809bb3ddc38eca6dc8990b9b7d61
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2683208
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72641}
parent 8dd251b8e4
commit 2367a71489
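
Both the unsigned conversion (F64x2ConvertLowI32x4U) and the unsigned truncation (I32x4TruncSatF64x2UZero) below rely on the same bit trick: the double with bit pattern 0x43300000'00000000 is exactly 2^52, and for any double in [2^52, 2^53) the low 52 significand bits hold the integer value directly. A minimal scalar sketch of the trick, as plain standalone C++ (not V8 code; names are illustrative):

// Scalar sketch of the 2^52 bias trick; plain C++, not V8 code.
#include <cstdint>
#include <cstdio>
#include <cstring>

// 2^52: doubles in [2^52, 2^53) store integers directly in the low
// 52 significand bits.
constexpr double kTwoPow52 = 4503599627370496.0;

// uint32 -> double, as in F64x2ConvertLowI32x4U: splice the uint32 into
// the low bits of 2^52 (high word 0x43300000), then subtract 2^52.
double U32ToF64(uint32_t u) {
  uint64_t bits = 0x4330000000000000ull | u;
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d - kTwoPow52;
}

// double -> uint32 (input already clamped to [0, UINT32_MAX] and
// truncated), as in I32x4TruncSatF64x2UZero: add 2^52, then the low
// 32 bits of the significand are the answer.
uint32_t F64ToU32(double d) {
  double biased = d + kTwoPow52;
  uint64_t bits;
  std::memcpy(&bits, &biased, sizeof(bits));
  return static_cast<uint32_t>(bits);
}

int main() {
  std::printf("%.1f\n", U32ToF64(0xFFFFFFFFu));  // 4294967295.0
  std::printf("%u\n", F64ToU32(4294967295.0));   // 4294967295
  return 0;
}

The SIMD versions do this two lanes at a time: Unpcklps interleaves the two uint32s with the 0x43300000 mask to build the biased doubles, and shufps with 0x88 collects the low words again.
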
@@ -867,6 +867,101 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
   }
 }
 
+void TurboAssembler::F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src,
+                                           Register tmp) {
+  // dst = [ src_low, 0x43300000, src_high, 0x43300000 ];
+  // 0x43300000'00000000 is a special double whose significand bits
+  // precisely represent all uint32 numbers.
+  Unpcklps(dst, src,
+           ExternalReferenceAsOperand(
+               ExternalReference::
+                   address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(),
+               tmp));
+  Subpd(dst, dst,
+        ExternalReferenceAsOperand(
+            ExternalReference::address_of_wasm_double_2_power_52(), tmp));
+}
+
+void TurboAssembler::I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
+                                             XMMRegister scratch,
+                                             Register tmp) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    XMMRegister original_dst = dst;
+    // Make sure we don't overwrite src.
+    if (dst == src) {
+      DCHECK_NE(scratch, src);
+      dst = scratch;
+    }
+    // dst = 0 if src == NaN, else all ones.
+    vcmpeqpd(dst, src, src);
+    // dst = 0 if src == NaN, else INT32_MAX as double.
+    vandpd(dst, dst,
+           ExternalReferenceAsOperand(
+               ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
+    // dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
+    vminpd(dst, src, dst);
+    // Values > INT32_MAX are already saturated; values < INT32_MIN raise an
+    // exception, which is masked and returns 0x80000000.
+    vcvttpd2dq(dst, dst);
+
+    if (original_dst != dst) {
+      vmovaps(original_dst, dst);
+    }
+  } else {
+    if (dst != src) {
+      movaps(dst, src);
+    }
+    movaps(scratch, dst);
+    cmpeqpd(scratch, dst);
+    andps(scratch,
+          ExternalReferenceAsOperand(
+              ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
+    minpd(dst, scratch);
+    cvttpd2dq(dst, dst);
+  }
+}
+
+void TurboAssembler::I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
+                                             XMMRegister scratch,
+                                             Register tmp) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vxorpd(scratch, scratch, scratch);
+    // Saturate to 0.
+    vmaxpd(dst, src, scratch);
+    // Saturate to UINT32_MAX.
+    vminpd(dst, dst,
+           ExternalReferenceAsOperand(
+               ExternalReference::address_of_wasm_uint32_max_as_double(),
+               tmp));
+    // Truncate.
+    vroundpd(dst, dst, kRoundToZero);
+    // Add to special double where significand bits == uint32.
+    vaddpd(dst, dst,
+           ExternalReferenceAsOperand(
+               ExternalReference::address_of_wasm_double_2_power_52(), tmp));
+    // Extract low 32 bits of each double's significand, zero top lanes.
+    // dst = [dst[0], dst[2], 0, 0]
+    vshufps(dst, dst, scratch, 0x88);
+  } else {
+    CpuFeatureScope scope(this, SSE4_1);
+    if (dst != src) {
+      movaps(dst, src);
+    }
+
+    xorps(scratch, scratch);
+    maxpd(dst, scratch);
+    minpd(dst,
+          ExternalReferenceAsOperand(
+              ExternalReference::address_of_wasm_uint32_max_as_double(), tmp));
+    roundpd(dst, dst, kRoundToZero);
+    addpd(dst,
+          ExternalReferenceAsOperand(
+              ExternalReference::address_of_wasm_double_2_power_52(), tmp));
+    shufps(dst, scratch, 0x88);
+  }
+}
+
 void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
   DCHECK_GE(63, shift);
   if (shift >= 32) {
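
For reference, a scalar model of the signed path above, as plain C++ with illustrative names (the compare/and/min sequence zeroes NaN lanes and clamps the upper range; cvttpd2dq itself handles the lower range, since its masked-exception result 0x80000000 equals the saturated INT32_MIN):

// Scalar model of the saturating f64 -> i32 path; plain C++, not V8 code.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int32_t TruncSatF64ToI32(double src) {
  constexpr int32_t kMin = std::numeric_limits<int32_t>::min();
  // cmpeqpd + andpd: NaN compares unequal to itself, so the AND mask
  // turns INT32_MAX-as-double into 0.0 exactly for NaN lanes.
  double upper_bound = (src == src) ? 2147483647.0 : 0.0;
  // minpd: clamp the upper range; NaN lanes become 0.0 here.
  double clamped = std::fmin(src, upper_bound);
  // cvttpd2dq: anything below INT32_MIN yields the masked-exception
  // result 0x80000000, which equals the saturated INT32_MIN.
  if (clamped < static_cast<double>(kMin)) return kMin;
  return static_cast<int32_t>(clamped);
}

int main() {
  std::printf("%d\n", TruncSatF64ToI32(std::nan("")));  // 0
  std::printf("%d\n", TruncSatF64ToI32(1e30));          // 2147483647
  std::printf("%d\n", TruncSatF64ToI32(-1e30));         // -2147483648
  return 0;
}
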
@@ -664,6 +664,11 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                    XMMRegister tmp2, Register scratch);
+  void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src, Register tmp);
+  void I32x4TruncSatF64x2SZero(XMMRegister dst, XMMRegister src,
+                               XMMRegister scratch, Register tmp);
+  void I32x4TruncSatF64x2UZero(XMMRegister dst, XMMRegister src,
+                               XMMRegister scratch, Register tmp);
 
   void Push(Register src) { push(src); }
   void Push(Operand src) { push(src); }
@@ -2048,81 +2048,15 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32I32x4TruncSatF64x2SZero: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      Register tmp = i.TempRegister(0);
-
-      if (CpuFeatures::IsSupported(AVX)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        DCHECK_NE(dst, src);
-        // dst = 0 if src == NaN, else all ones.
-        __ vcmpeqpd(dst, src, src);
-        // dst = 0 if src == NaN, else INT32_MAX as double.
-        __ vandpd(
-            dst, dst,
-            __ ExternalReferenceAsOperand(
-                ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
-        // dst = 0 if src == NaN, src is saturated to INT32_MAX as double.
-        __ vminpd(dst, src, dst);
-        // Values > INT32_MAX already saturated, values < INT32_MIN raises an
-        // exception, which is masked and returns 0x80000000.
-        __ vcvttpd2dq(dst, dst);
-      } else {
-        DCHECK_EQ(dst, src);
-        __ movaps(kScratchDoubleReg, src);
-        __ cmpeqpd(kScratchDoubleReg, src);
-        __ andps(
-            kScratchDoubleReg,
-            __ ExternalReferenceAsOperand(
-                ExternalReference::address_of_wasm_int32_max_as_double(), tmp));
-        __ minpd(dst, kScratchDoubleReg);
-        __ cvttpd2dq(dst, dst);
-      }
+      __ I32x4TruncSatF64x2SZero(i.OutputSimd128Register(),
+                                 i.InputSimd128Register(0), kScratchDoubleReg,
+                                 i.TempRegister(0));
       break;
     }
     case kIA32I32x4TruncSatF64x2UZero: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      Register tmp = i.TempRegister(0);
-
-      if (CpuFeatures::IsSupported(AVX)) {
-        CpuFeatureScope avx_scope(tasm(), AVX);
-        __ vxorpd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
-        // Saturate to 0.
-        __ vmaxpd(dst, src, kScratchDoubleReg);
-        // Saturate to UINT32_MAX.
-        __ vminpd(dst, dst,
-                  __ ExternalReferenceAsOperand(
-                      ExternalReference::address_of_wasm_uint32_max_as_double(),
-                      tmp));
-        // Truncate.
-        __ vroundpd(dst, dst, kRoundToZero);
-        // Add to special double where significant bits == uint32.
-        __ vaddpd(
-            dst, dst,
-            __ ExternalReferenceAsOperand(
-                ExternalReference::address_of_wasm_double_2_power_52(), tmp));
-        // Extract low 32 bits of each double's significand, zero top lanes.
-        // dst = [dst[0], dst[2], 0, 0]
-        __ vshufps(dst, dst, kScratchDoubleReg, 0x88);
-        break;
-      } else {
-        CpuFeatureScope scope(tasm(), SSE4_1);
-        DCHECK_EQ(dst, src);
-        __ xorps(kScratchDoubleReg, kScratchDoubleReg);
-        __ maxpd(dst, kScratchDoubleReg);
-        __ minpd(dst,
-                 __ ExternalReferenceAsOperand(
-                     ExternalReference::address_of_wasm_uint32_max_as_double(),
-                     tmp));
-        __ roundpd(dst, dst, kRoundToZero);
-        __ addpd(
-            dst,
-            __ ExternalReferenceAsOperand(
-                ExternalReference::address_of_wasm_double_2_power_52(), tmp));
-        __ shufps(dst, kScratchDoubleReg, 0x88);
-        break;
-      }
+      __ I32x4TruncSatF64x2UZero(i.OutputSimd128Register(),
+                                 i.InputSimd128Register(0), kScratchDoubleReg,
+                                 i.TempRegister(0));
      break;
    }
    case kIA32F64x2ConvertLowI32x4S: {
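
Both removed blocks above, and the macro-assembler versions that replace them, finish the unsigned truncation with a shufps/vshufps whose immediate is 0x88. A scalar sketch of what that shuffle computes, as plain C++ with illustrative names:

// Scalar sketch of shufps with imm8 = 0x88; plain C++, not V8 code.
#include <array>
#include <cstdint>
#include <cstdio>

std::array<uint32_t, 4> Shufps0x88(const std::array<uint32_t, 4>& a,
                                   const std::array<uint32_t, 4>& b) {
  // imm8 = 0x88 = 0b10'00'10'00: lanes a[0], a[2] from the first
  // operand, lanes b[0], b[2] from the second.
  return {a[0], a[2], b[0], b[2]};
}

int main() {
  // With a zeroed second operand this is the final step of
  // I32x4TruncSatF64x2UZero: keep the low word of each biased double,
  // zero the top two lanes.
  auto r = Shufps0x88({11, 22, 33, 44}, {0, 0, 0, 0});
  std::printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  // 11 33 0 0
  return 0;
}
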
@@ -2130,21 +2064,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kIA32F64x2ConvertLowI32x4U: {
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      Register tmp = i.TempRegister(0);
-      // dst = [ src_low, 0x43300000, src_high, 0x4330000 ];
-      // 0x43300000'00000000 is a special double where the significand bits
-      // precisely represents all uint32 numbers.
-      __ Unpcklps(dst, src,
-                  __ ExternalReferenceAsOperand(
-                      ExternalReference::
-                          address_of_wasm_f64x2_convert_low_i32x4_u_int_mask(),
-                      tmp));
-      __ Subpd(
-          dst, dst,
-          __ ExternalReferenceAsOperand(
-              ExternalReference::address_of_wasm_double_2_power_52(), tmp));
+      __ F64x2ConvertLowI32x4U(i.OutputSimd128Register(),
+                               i.InputSimd128Register(0), i.TempRegister(0));
      break;
    }
    case kIA32I64x2ExtMulLowI32x4S: {
@@ -4296,17 +4296,18 @@ void LiftoffAssembler::emit_f64x2_pmax(LiftoffRegister dst, LiftoffRegister lhs,
 
 void LiftoffAssembler::emit_f64x2_convert_low_i32x4_s(LiftoffRegister dst,
                                                       LiftoffRegister src) {
-  bailout(kSimd, "f64x2.convert_low_i32x4_s");
+  Cvtdq2pd(dst.fp(), src.fp());
 }
 
 void LiftoffAssembler::emit_f64x2_convert_low_i32x4_u(LiftoffRegister dst,
                                                       LiftoffRegister src) {
-  bailout(kSimd, "f64x2.convert_low_i32x4_u");
+  Register tmp = GetUnusedRegister(kGpReg, {}).gp();
+  F64x2ConvertLowI32x4U(dst.fp(), src.fp(), tmp);
 }
 
 void LiftoffAssembler::emit_f64x2_promote_low_f32x4(LiftoffRegister dst,
                                                     LiftoffRegister src) {
-  bailout(kSimd, "f64x2.promote_low_f32x4");
+  Cvtps2pd(dst.fp(), src.fp());
 }
 
 void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
@@ -4402,7 +4403,7 @@ void LiftoffAssembler::emit_f32x4_uconvert_i32x4(LiftoffRegister dst,
 
 void LiftoffAssembler::emit_f32x4_demote_f64x2_zero(LiftoffRegister dst,
                                                     LiftoffRegister src) {
-  bailout(kSimd, "f32x4.demote_f64x2_zero");
+  Cvtpd2ps(dst.fp(), src.fp());
 }
 
 void LiftoffAssembler::emit_i8x16_sconvert_i16x8(LiftoffRegister dst,
@@ -4483,12 +4484,14 @@ void LiftoffAssembler::emit_i32x4_uconvert_i16x8_high(LiftoffRegister dst,
 
 void LiftoffAssembler::emit_i32x4_trunc_sat_f64x2_s_zero(LiftoffRegister dst,
                                                          LiftoffRegister src) {
-  bailout(kSimd, "i32x4.trunc_sat_f64x2_s_zero");
+  Register tmp = GetUnusedRegister(kGpReg, {}).gp();
+  I32x4TruncSatF64x2SZero(dst.fp(), src.fp(), liftoff::kScratchDoubleReg, tmp);
 }
 
 void LiftoffAssembler::emit_i32x4_trunc_sat_f64x2_u_zero(LiftoffRegister dst,
                                                          LiftoffRegister src) {
-  bailout(kSimd, "i32x4.trunc_sat_f64x2_u_zero");
+  Register tmp = GetUnusedRegister(kGpReg, {}).gp();
+  I32x4TruncSatF64x2UZero(dst.fp(), src.fp(), liftoff::kScratchDoubleReg, tmp);
 }
 
 void LiftoffAssembler::emit_s128_and_not(LiftoffRegister dst,