mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-22 21:10:07 +00:00
sparc: Use ba,a,pt in PLTs and fix bugs in R_SPARC_JMP_IREL handling.
2010-03-03 David S. Miller <davem@davemloft.net>
* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must pass '1' for 't' argument to sparc_fixup_plt.
* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel): Likewise.
* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
(sparc_fixup_plt): Document 't' argument. Enable branch optimization and use v9 branches when possible. Explain why we cannot unconditionally patch the branch into the first PLT instruction.
* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't' argument. Use v9 branches when possible. Explain why we can in fact unconditionally use a branch in the first PLT instruction here.
This commit is contained in:
parent
42488a4d31
commit
7ec1221ff7
16
ChangeLog
16
ChangeLog
@ -1,3 +1,19 @@
|
||||
2010-03-03 David S. Miller <davem@davemloft.net>
|
||||
|
||||
* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
|
||||
pass '1' for 't' argument to sparc_fixup_plt.
|
||||
* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
|
||||
Likewise.
|
||||
* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
|
||||
(sparc_fixup_plt): Document 't' argument. Enable branch
|
||||
optimization and use v9 branches when possible. Explain why we
|
||||
cannot unconditionally patch the branch into the first PLT
|
||||
instruction.
|
||||
* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
|
||||
argument. Use v9 branches when possible. Explain why we can in
|
||||
fact unconditionally use a branch in the first PLT instruction
|
||||
here.
|
||||
|
||||
2010-02-28 Roland McGrath <roland@redhat.com>
|
||||
|
||||
* elf/elf.h (NT_X86_XSTATE): New macro.
|
||||
|
@ -563,7 +563,7 @@ elf_machine_lazy_rel (struct link_map *map,
|
||||
{
|
||||
Elf32_Addr value = map->l_addr + reloc->r_addend;
|
||||
value = ((Elf32_Addr (*) (void)) value) ();
|
||||
sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
|
||||
sparc_fixup_plt (reloc, reloc_addr, value, 1, 1);
|
||||
}
|
||||
else if (r_type == R_SPARC_NONE)
|
||||
;
|
||||
|
@ -25,19 +25,55 @@
|
||||
#define OPCODE_JMP_G1 0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
|
||||
#define OPCODE_SAVE_SP 0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
|
||||
#define OPCODE_BA 0x30800000 /* b,a ?; add PC-rel word address */
|
||||
#define OPCODE_BA_PT 0x30480000 /* ba,a,pt %icc, ?; add PC-rel word address */
|
||||
|
||||
static inline __attribute__ ((always_inline)) Elf32_Addr
|
||||
sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
|
||||
Elf32_Addr value, int t, int do_flush)
|
||||
{
|
||||
Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
|
||||
Elf32_Sword disp;
|
||||
|
||||
if (0 && disp >= -0x800000 && disp < 0x800000)
|
||||
/* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
|
||||
in which case we'll be resolving all PLT entries and thus can
|
||||
optimize by overwriting instructions starting at the first PLT entry
|
||||
instruction, and we need not be mindful of thread safety.
|
||||
|
||||
Otherwise, 't' is '1'. */
|
||||
reloc_addr += t;
|
||||
disp = value - (Elf32_Addr) reloc_addr;
|
||||
|
||||
if (disp >= -0x800000 && disp < 0x800000)
|
||||
{
|
||||
/* Don't need to worry about thread safety. We're writing just one
|
||||
instruction. */
|
||||
unsigned int insn = OPCODE_BA | ((disp >> 2) & 0x3fffff);
|
||||
|
||||
reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
|
||||
#ifdef __sparc_v9__
|
||||
/* On V9 we can do even better by using a branch with
|
||||
prediction if we fit into the even smaller 19-bit
|
||||
displacement field. */
|
||||
if (disp >= -0x100000 && disp < 0x100000)
|
||||
insn = OPCODE_BA_PT | ((disp >> 2) & 0x07ffff);
|
||||
#endif
|
||||
|
||||
/* Even if we are writing just a single branch, we must not
|
||||
ignore the 't' offset. Consider a case where we have some
|
||||
PLT slots which can be optimized into a single branch and
|
||||
some which cannot. Then we can end up with a PLT which looks
|
||||
like:
|
||||
|
||||
PLT4.0: sethi %(PLT_4_INDEX), %g1
|
||||
sethi %(fully_resolved_sym_4), %g1
|
||||
jmp %g1 + %lo(fully_resolved_sym_4)
|
||||
PLT5.0: ba,a fully_resolved_sym_5
|
||||
ba,a PLT0.0
|
||||
...
|
||||
|
||||
The delay slot of that jmp must always be either a sethi to
|
||||
%g1 or a nop. But if we try to place this displacement
|
||||
branch there, PLT4.0 will jump to fully_resolved_sym_4 for 1
|
||||
instruction and then go immediately to
|
||||
fully_resolved_sym_5. */
|
||||
|
||||
reloc_addr[0] = insn;
|
||||
if (do_flush)
|
||||
__asm __volatile ("flush %0" : : "r"(reloc_addr));
|
||||
}
|
||||
@ -48,7 +84,6 @@ sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
|
||||
need not be done during bootstrapping, since there are no threads.
|
||||
But we also can't tell if we _can_ use flush, so don't. */
|
||||
|
||||
reloc_addr += t;
|
||||
reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
|
||||
if (do_flush)
|
||||
__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
|
||||
|
@ -661,7 +661,7 @@ elf_machine_lazy_rel (struct link_map *map,
|
||||
{
|
||||
/* 'high' is always zero; for large PLT entries the linker
|
||||
emits an R_SPARC_IRELATIVE. */
|
||||
sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
|
||||
sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 1);
|
||||
}
|
||||
else
|
||||
*reloc_addr = value;
|
||||
|
@ -28,7 +28,14 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
|
||||
Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
|
||||
Elf64_Sxword disp = value - plt_vaddr;
|
||||
|
||||
/* Now move plt_vaddr up to the call instruction. */
|
||||
/* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
|
||||
in which case we'll be resolving all PLT entries and thus can
|
||||
optimize by overwriting instructions starting at the first PLT entry
|
||||
instruction, and we need not be mindful of thread safety.
|
||||
|
||||
Otherwise, 't' is '1'.
|
||||
|
||||
Now move plt_vaddr up to the call instruction. */
|
||||
plt_vaddr += ((t + 1) * 4);
|
||||
|
||||
/* PLT entries .PLT32768 and above look always the same. */
|
||||
@ -39,10 +46,22 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
|
||||
/* Near destination. */
|
||||
else if (disp >= -0x800000 && disp < 0x800000)
|
||||
{
|
||||
/* As this is just one instruction, it is thread safe and so
|
||||
we can avoid the unnecessary sethi FOO, %g1.
|
||||
b,a target */
|
||||
insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
|
||||
unsigned int insn;
|
||||
|
||||
/* ba,a */
|
||||
insn = 0x30800000 | ((disp >> 2) & 0x3fffff);
|
||||
|
||||
if (disp >= -0x100000 && disp < 0x100000)
|
||||
{
|
||||
/* ba,a,pt %icc */
|
||||
insn = 0x30480000 | ((disp >> 2) & 0x07ffff);
|
||||
}
|
||||
|
||||
/* As this is just one instruction, it is thread safe and so we
|
||||
can avoid the unnecessary sethi FOO, %g1. Each 64-bit PLT
|
||||
entry is 8 instructions long, so we can't run into the 'jmp'
|
||||
delay slot problems 32-bit PLTs can. */
|
||||
insns[0] = insn;
|
||||
__asm __volatile ("flush %0" : : "r" (insns));
|
||||
}
|
||||
/* 32-bit Sparc style, the target is in the lower 32-bits of
|
||||
|
Loading…
Reference in New Issue
Block a user