sparc: Use ba,a,pt in PLTs and fix bugs in R_SPARC_JMP_IREL handling.

2010-03-03  David S. Miller  <davem@davemloft.net>

	* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
	pass '1' for 't' argument to sparc_fixup_plt.
	* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
	Likewise.
	* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
	(sparc_fixup_plt): Document 't' argument.  Enable branch
	optimization and use v9 branches when possible.  Explain why we
	cannot unconditionally patch the branch into the first PLT
	instruction.
	* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
	argument.  Use v9 branches when possible.  Explain why we can in
	fact unconditionally use a branch in the first PLT instruction
	here.
This commit is contained in:
David S. Miller 2010-03-03 02:10:22 -08:00
parent 42488a4d31
commit 7ec1221ff7
5 changed files with 83 additions and 13 deletions

View File

@ -1,3 +1,19 @@
2010-03-03 David S. Miller <davem@davemloft.net>
* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
pass '1' for 't' argument to sparc_fixup_plt.
* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
Likewise.
* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
(sparc_fixup_plt): Document 't' argument. Enable branch
optimization and use v9 branches when possible. Explain why we
cannot unconditionally patch the branch into the first PLT
instruction.
* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
argument. Use v9 branches when possible. Explain why we can in
fact unconditionally use a branch in the first PLT instruction
here.
2010-02-28 Roland McGrath <roland@redhat.com>
* elf/elf.h (NT_X86_XSTATE): New macro.

View File

@ -563,7 +563,7 @@ elf_machine_lazy_rel (struct link_map *map,
{
Elf32_Addr value = map->l_addr + reloc->r_addend;
value = ((Elf32_Addr (*) (void)) value) ();
sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
sparc_fixup_plt (reloc, reloc_addr, value, 1, 1);
}
else if (r_type == R_SPARC_NONE)
;

View File

@ -25,19 +25,55 @@
#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */
#define OPCODE_BA_PT 0x30480000 /* ba,a,pt %icc, ?; add PC-rel word address */
static inline __attribute__ ((always_inline)) Elf32_Addr
sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
Elf32_Addr value, int t, int do_flush)
{
Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
Elf32_Sword disp;
if (0 && disp >= -0x800000 && disp < 0x800000)
/* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap.
In that case we will be resolving all PLT entries, so we can
optimize by overwriting instructions starting at the first
instruction of the PLT entry, and we need not be mindful of thread
safety.  Otherwise, 't' is '1'. */
reloc_addr += t;
disp = value - (Elf32_Addr) reloc_addr;
if (disp >= -0x800000 && disp < 0x800000)
{
/* Don't need to worry about thread safety. We're writing just one
instruction. */
unsigned int insn = OPCODE_BA | ((disp >> 2) & 0x3fffff);
reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
#ifdef __sparc_v9__
/* On V9 we can do even better by using a branch with
prediction if we fit into the even smaller 19-bit
displacement field. */
if (disp >= -0x100000 && disp < 0x100000)
insn = OPCODE_BA_PT | ((disp >> 2) & 0x07ffff);
#endif
/* Even if we are writing just a single branch, we must not
ignore the 't' offset. Consider a case where we have some
PLT slots which can be optimized into a single branch and
some which cannot. Then we can end up with a PLT which looks
like:
PLT4.0: sethi %(PLT_4_INDEX), %g1
sethi %(fully_resolved_sym_4), %g1
jmp %g1 + %lo(fully_resolved_sym_4)
PLT5.0: ba,a fully_resolved_sym_5
ba,a PLT0.0
...
The delay slot of that jmp must always be either a sethi to
%g1 or a nop. But if we try to place this displacement
branch there, PLT4.0 will jump to fully_resolved_sym_4 for 1
instruction and then go immediately to
fully_resolved_sym_5. */
reloc_addr[0] = insn;
if (do_flush)
__asm __volatile ("flush %0" : : "r"(reloc_addr));
}
@ -48,7 +84,6 @@ sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
need not be done during bootstrapping, since there are no threads.
But we also can't tell if we _can_ use flush, so don't. */
reloc_addr += t;
reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
if (do_flush)
__asm __volatile ("flush %0+4" : : "r"(reloc_addr));

View File

@ -661,7 +661,7 @@ elf_machine_lazy_rel (struct link_map *map,
{
/* 'high' is always zero, for large PLT entries the linker
emits an R_SPARC_IRELATIVE. */
sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 1);
}
else
*reloc_addr = value;

View File

@ -28,7 +28,14 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
Elf64_Sxword disp = value - plt_vaddr;
/* Now move plt_vaddr up to the call instruction. */
/* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap.
In that case we will be resolving all PLT entries, so we can
optimize by overwriting instructions starting at the first
instruction of the PLT entry, and we need not be mindful of thread
safety.  Otherwise, 't' is '1'.
Now move plt_vaddr up to the call instruction. */
plt_vaddr += ((t + 1) * 4);
/* PLT entries .PLT32768 and above always look the same. */
@ -39,10 +46,22 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
/* Near destination. */
else if (disp >= -0x800000 && disp < 0x800000)
{
/* As this is just one instruction, it is thread safe and so
we can avoid the unnecessary sethi FOO, %g1.
b,a target */
insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
unsigned int insn;
/* ba,a */
insn = 0x30800000 | ((disp >> 2) & 0x3fffff);
if (disp >= -0x100000 && disp < 0x100000)
{
/* ba,a,pt %icc */
insn = 0x30480000 | ((disp >> 2) & 0x07ffff);
}
/* As this is just one instruction, it is thread safe and so we
can avoid the unnecessary sethi FOO, %g1. Each 64-bit PLT
entry is 8 instructions long, so we can't run into the 'jmp'
delay slot problems 32-bit PLTs can. */
insns[0] = insn;
__asm __volatile ("flush %0" : : "r" (insns));
}
/* 32-bit Sparc style, the target is in the lower 32-bits of