[ia32] Fix partial regression in Cvtsi2ss/Cvtsi2sd

In https://crrev.com/c/3131374 we switched some instructions to use macro-assembler functions which can handle both AVX and SSE. However, for Cvtsi2ss and Cvtsi2sd, the behavior subtly changed. The old behavior directly called cvtsi2ss/cvtsi2sd in the code-generator. The new behavior used the macro-assembler functions, which additionally xor the dst operand. This led to more instructions and larger code size in some benchmarks.

The xor is supposed to help reduce dependence chain length (see comments on Cvtsi2ss), but doesn't seem to have helped in the affected benchmarks. So, partially revert the changes, and rename all affected IA32 opcodes back to SSE.

Bug: chromium:1248509
Change-Id: Ie700e2980fe9ed083c1160bda3a28f64e1e43041
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3154349
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Adam Klein <adamk@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76775}
2021-09-10 14:17:02 -07:00 · 2021-09-10 14:17:02 -07:00 · 5f622d21a5
commit 5f622d21a5
parent d7c9b31a77
4 changed files with 12 additions and 10 deletions
--- a/src/compiler/backend/ia32/code-generator-ia32.cc
+++ b/src/compiler/backend/ia32/code-generator-ia32.cc
@@ -1439,15 +1439,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ Cvttsd2ui(i.OutputRegister(), i.InputOperand(0),
                   i.TempSimd128Register(0));
      break;
-    case kIA32Int32ToFloat32:
-      __ Cvtsi2ss(i.OutputDoubleRegister(), i.InputOperand(0));
+    case kSSEInt32ToFloat32:
+      // Calling Cvtsi2ss (which does a xor) regresses some benchmarks.
+      __ cvtsi2ss(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kIA32Uint32ToFloat32:
      __ Cvtui2ss(i.OutputDoubleRegister(), i.InputOperand(0),
                  i.TempRegister(0));
      break;
-    case kIA32Int32ToFloat64:
-      __ Cvtsi2sd(i.OutputDoubleRegister(), i.InputOperand(0));
+    case kSSEInt32ToFloat64:
+      // Calling Cvtsi2sd (which does a xor) regresses some benchmarks.
+      __ cvtsi2sd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kIA32Uint32ToFloat64:
      __ Cvtui2sd(i.OutputDoubleRegister(), i.InputOperand(0),
--- a/src/compiler/backend/ia32/instruction-codes-ia32.h
+++ b/src/compiler/backend/ia32/instruction-codes-ia32.h
@@ -64,9 +64,9 @@ namespace compiler {
  V(IA32Float32ToUint32)           \
  V(IA32Float64ToInt32)            \
  V(IA32Float64ToUint32)           \
-  V(IA32Int32ToFloat32)            \
+  V(SSEInt32ToFloat32)             \
  V(IA32Uint32ToFloat32)           \
-  V(IA32Int32ToFloat64)            \
+  V(SSEInt32ToFloat64)             \
  V(IA32Uint32ToFloat64)           \
  V(IA32Float64ExtractLowWord32)   \
  V(IA32Float64ExtractHighWord32)  \
--- a/src/compiler/backend/ia32/instruction-scheduler-ia32.cc
+++ b/src/compiler/backend/ia32/instruction-scheduler-ia32.cc
@@ -65,9 +65,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kIA32Float32ToUint32:
    case kIA32Float64ToInt32:
    case kIA32Float64ToUint32:
-    case kIA32Int32ToFloat32:
+    case kSSEInt32ToFloat32:
    case kIA32Uint32ToFloat32:
-    case kIA32Int32ToFloat64:
+    case kSSEInt32ToFloat64:
    case kIA32Uint32ToFloat64:
    case kIA32Float64ExtractLowWord32:
    case kIA32Float64ExtractHighWord32:
--- a/src/compiler/backend/ia32/instruction-selector-ia32.cc
+++ b/src/compiler/backend/ia32/instruction-selector-ia32.cc
@@ -1128,8 +1128,8 @@ void InstructionSelector::VisitWord32Ror(Node* node) {
  V(Word32Ctz, kIA32Tzcnt)                                   \
  V(Word32Popcnt, kIA32Popcnt)                               \
  V(ChangeFloat32ToFloat64, kIA32Float32ToFloat64)           \
-  V(RoundInt32ToFloat32, kIA32Int32ToFloat32)                \
-  V(ChangeInt32ToFloat64, kIA32Int32ToFloat64)               \
+  V(RoundInt32ToFloat32, kSSEInt32ToFloat32)                 \
+  V(ChangeInt32ToFloat64, kSSEInt32ToFloat64)                \
  V(TruncateFloat32ToInt32, kIA32Float32ToInt32)             \
  V(ChangeFloat64ToInt32, kIA32Float64ToInt32)               \
  V(TruncateFloat64ToFloat32, kIA32Float64ToFloat32)         \