sparc: Use ba,a,pt in PLTs and fix bugs in R_SPARC_JMP_IREL handling.

2010-03-03 David S. Miller <davem@davemloft.net> * sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must pass '1' for 't' argument to sparc_fixup_plt. * sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel): Likewise. * sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define. (sparc_fixup_plt): Document 't' argument. Enable branch optimization and use v9 branches when possible. Explain why we cannot unconditionally patch the branch into the first PLT instruction. * sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't' argument. Use v9 branches when possible. Explain why we can in fact unconditionally use a branch in the first PLT instruction here.
2024-12-23 11:20:07 +00:00 · 2010-03-03 02:10:22 -08:00 · 2010-03-03 02:10:22 -08:00 · 7ec1221ff7
commit 7ec1221ff7
parent 42488a4d31
5 changed files with 83 additions and 13 deletions
--- a/16
+++ b/16
@@ -1,3 +1,19 @@
+2010-03-03  David S. Miller  <davem@davemloft.net>
+
+	* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
+	pass '1' for 't' argument to sparc_fixup_plt.
+	* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
+	Likewise.
+	* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
+	(sparc_fixup_plt): Document 't' argument.  Enable branch
+	optimization and use v9 branches when possible.  Explain why we
+	cannot unconditionally patch the branch into the first PLT
+	instruction.
+	* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
+	argument.  Use v9 branches when possible.  Explain why we can in
+	fact unconditionally use a branch in the first PLT instruction
+	here.
+
 2010-02-28  Roland McGrath  <roland@redhat.com>

 	* elf/elf.h (NT_X86_XSTATE): New macro.
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -563,7 +563,7 @@ elf_machine_lazy_rel (struct link_map *map,
    {
      Elf32_Addr value = map->l_addr + reloc->r_addend;
      value = ((Elf32_Addr (*) (void)) value) ();
-      sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
+      sparc_fixup_plt (reloc, reloc_addr, value, 1, 1);
    }
  else if (r_type == R_SPARC_NONE)
    ;
--- a/sysdeps/sparc/sparc32/dl-plt.h
+++ b/sysdeps/sparc/sparc32/dl-plt.h
@@ -25,19 +25,55 @@
 #define OPCODE_JMP_G1	0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
 #define OPCODE_SAVE_SP	0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
 #define OPCODE_BA	0x30800000 /* b,a ?; add PC-rel word address */
+#define OPCODE_BA_PT	0x30480000 /* ba,a,pt %icc, ?; add PC-rel word address */

 static inline __attribute__ ((always_inline)) Elf32_Addr
 sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
 		 Elf32_Addr value, int t, int do_flush)
 {
-  Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
+  Elf32_Sword disp;

-  if (0 && disp >= -0x800000 && disp < 0x800000)
+  /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
+     in which case we'll be resolving all PLT entries and thus can
+     optimize by overwriting instructions starting at the first PLT entry
+     instruction and we need not be mindful of thread safety.
+
+     Otherwise, 't' is '1'.  */
+  reloc_addr += t;
+  disp = value - (Elf32_Addr) reloc_addr;
+
+  if (disp >= -0x800000 && disp < 0x800000)
    {
-      /* Don't need to worry about thread safety. We're writing just one
-	 instruction.  */
+      unsigned int insn = OPCODE_BA | ((disp >> 2) & 0x3fffff);

-      reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+#ifdef __sparc_v9__
+      /* On V9 we can do even better by using a branch with
+	 prediction if we fit into the even smaller 19-bit
+	 displacement field.  */
+      if (disp >= -0x100000 && disp < 0x100000)
+	insn = OPCODE_BA_PT | ((disp >> 2) & 0x07ffff);
+#endif
+
+      /* Even if we are writing just a single branch, we must not
+	 ignore the 't' offset.  Consider a case where we have some
+	 PLT slots which can be optimized into a single branch and
+	 some which cannot.  Then we can end up with a PLT which looks
+	 like:
+
+		PLT4.0: sethi	%(PLT_4_INDEX), %g1
+			sethi	%hi(fully_resolved_sym_4), %g1
+			jmp	%g1 + %lo(fully_resolved_sym_4)
+		PLT5.0:	ba,a	fully_resolved_sym_5
+			ba,a	PLT0.0
+			...
+
+	  The delay slot of that jmp must always be either a sethi to
+	  %g1 or a nop.  But if we try to place this displacement
+	  branch there, PLT4.0 will jump to fully_resolved_sym_4 for 1
+	  instruction and then go immediately to
+	  fully_resolved_sym_5.  */
+
+      reloc_addr[0] = insn;
      if (do_flush)
 	__asm __volatile ("flush %0" : : "r"(reloc_addr));
    }
@@ -48,7 +84,6 @@ sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
 	 need not be done during bootstrapping, since there are no threads.
 	 But we also can't tell if we _can_ use flush, so don't. */

-      reloc_addr += t;
      reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
      if (do_flush)
 	__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -661,7 +661,7 @@ elf_machine_lazy_rel (struct link_map *map,
 	{
 	  /* 'high' is always zero, for large PLT entries the linker
 	     emits an R_SPARC_IRELATIVE.  */
-	  sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
+	  sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 1);
 	}
      else
 	*reloc_addr = value;
--- a/sysdeps/sparc/sparc64/dl-plt.h
+++ b/sysdeps/sparc/sparc64/dl-plt.h
@@ -28,7 +28,14 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
  Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
  Elf64_Sxword disp = value - plt_vaddr;

-  /* Now move plt_vaddr up to the call instruction.  */
+  /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
+     in which case we'll be resolving all PLT entries and thus can
+     optimize by overwriting instructions starting at the first PLT entry
+     instruction and we need not be mindful of thread safety.
+
+     Otherwise, 't' is '1'.
+
+     Now move plt_vaddr up to the call instruction.  */
  plt_vaddr += ((t + 1) * 4);

  /* PLT entries .PLT32768 and above look always the same.  */
@@ -39,10 +46,22 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
  /* Near destination.  */
  else if (disp >= -0x800000 && disp < 0x800000)
    {
-      /* As this is just one instruction, it is thread safe and so
-	 we can avoid the unnecessary sethi FOO, %g1.
-	 b,a target  */
-      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+      unsigned int insn;
+
+      /* ba,a */
+      insn = 0x30800000 | ((disp >> 2) & 0x3fffff);
+
+      if (disp >= -0x100000 && disp < 0x100000)
+	{
+	  /* ba,a,pt %icc */
+	  insn = 0x30480000  | ((disp >> 2) & 0x07ffff);
+	}
+
+      /* As this is just one instruction, it is thread safe and so we
+	 can avoid the unnecessary sethi FOO, %g1.  Each 64-bit PLT
+	 entry is 8 instructions long, so we can't run into the 'jmp'
+	 delay slot problems 32-bit PLTs can.  */
+      insns[0] = insn;
      __asm __volatile ("flush %0" : : "r" (insns));
    }
  /* 32-bit Sparc style, the target is in the lower 32-bits of