1
1
mirror of https://github.com/rui314/mold.git synced 2024-09-11 13:06:59 +03:00
mold/elf/arch-ppc32.cc
2023-11-22 20:56:21 +09:00

454 lines
13 KiB
C++

// This file implements the PowerPC 32-bit ISA. For 64-bit PowerPC, see
// arch-ppc64v1.cc and arch-ppc64v2.cc.
//
// PPC32 is a RISC ISA. It has 32 general-purpose registers (GPRs).
// r0, r11 and r12 are reserved for static linkers, so we can use these
// registers in PLTs and range extension thunks. In addition to that, it
// has a few special registers. Notable ones are LR which holds a return
// address and CTR which we can use to store a branch target address.
//
// At first glance, the PPC32 psABI feels unnecessarily complicated, but
// that mainly stems from the fact that the ISA lacks PC-relative
// load/store instructions. Since machine instructions cannot load data
// relative to their own addresses, it is not straightforward to support
// position-independent code (PIC) on PPC32.
//
// A position-independent function typically contains the following code
// in the prologue to obtain its own address:
//
//   mflr  r0        // save the current return address to %r0
//   bcl   20, 31, 4 // call the next instruction as if it were a function
//   mflr  r12       // save the return address (this insn's address) to %r12
//   mtlr  r0        // restore the original return address
//
// An object file compiled with -fPIC contains a data section named
// `.got2` to store addresses of locally-defined global variables and
// constants. A PIC function usually computes its .got2+0x8000 and sets it
// to %r30. This scheme allows the function to access global objects
// defined in the same input file with a single %r30-relative load/store
// instruction with a 16-bit offset, given that .got2 is smaller than
// 0x10000 (or 65536) bytes.
//
// Since each object file has its own .got2, %r30 refers to different
// places in a merged .got2 for two functions that came from different
// input files. Therefore, %r30 makes sense only within a single function.
//
// Technically, we can reuse a %r30 value in our PLT if we create a PLT
// _for each input file_ (that's what GNU ld seems to be doing), but that
// doesn't seem to be worth the complexity. Our PLT simply doesn't rely
// on a %r30 value.
//
// https://github.com/rui314/psabi/blob/main/ppc32.pdf
#include "mold.h"
namespace mold::elf {
using E = PPC32;
// Helpers that pick out the 16-bit halves of a value. They are used to
// fill the immediate fields of instructions and 16-bit relocation fields
// in this file (cf. the `@lo` / `@higha` notation in the instruction
// comments).

// Low 16 bits of x.
static u64 lo(u64 x) {
  return x & 0xffff;
}

// x with its low 16 bits shifted out. The result is NOT masked to 16 bits.
static u64 hi(u64 x) {
  return x >> 16;
}

// Same as hi(), except that 0x8000 is added before shifting, so the
// result is one larger whenever bit 15 of x is set.
// NOTE(review): presumably this compensates for the paired low half being
// added as a sign-extended 16-bit immediate (e.g. by `addi`).
static u64 ha(u64 x) {
  return (x + 0x8000) >> 16;
}

// hi(x) truncated to 16 bits.
static u64 high(u64 x) {
  return lo(hi(x));
}

// ha(x) truncated to 16 bits.
static u64 higha(u64 x) {
  return lo(ha(x));
}
// Writes the PLT header to `buf`.
//
// The header is a fixed instruction sequence of E::plt_hdr_size bytes. Only
// the immediates of the addis/addi pair (words 4 and 5) depend on the
// layout; they are patched with the distance between .plt and .got.plt.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ub32 insn[] = {
    // Get the address of this PLT section
    0x7c08'02a6, //    mflr    r0
    0x429f'0005, //    bcl     20, 31, 4
    0x7d88'02a6, // 1: mflr    r12
    0x7c08'03a6, //    mtlr    r0

    // Compute the runtime address of GOTPLT+12
    0x3d8c'0000, //    addis   r12, r12, (GOTPLT - 1b)@higha
    0x398c'0000, //    addi    r12, r12, (GOTPLT - 1b)@lo

    // Compute the PLT entry offset
    0x7d6c'5850, //    sub     r11, r11, r12
    0x1d6b'0003, //    mulli   r11, r11, 3

    // Load GOTPLT[2] and branch to GOTPLT[1]
    0x800c'fff8, //    lwz     r0,  -8(r12)
    0x7c09'03a6, //    mtctr   r0
    0x818c'fffc, //    lwz     r12, -4(r12)
    0x4e80'0420, //    bctr

    // Padding
    0x6000'0000, //    nop
    0x6000'0000, //    nop
    0x6000'0000, //    nop
    0x6000'0000, //    nop
  };

  static_assert(sizeof(insn) == E::plt_hdr_size);

  // At label 1, %r12 holds the address of label 1 itself, which is 8 bytes
  // into the header. Adding (GOTPLT - PLT + 4) to it yields GOTPLT + 12.
  u64 disp = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4;

  memcpy(buf, insn, sizeof(insn));

  ub32 *words = (ub32 *)buf;
  words[4] |= higha(disp);
  words[5] |= lo(disp);
}
// Instruction template shared by write_plt_entry, write_pltgot_entry and
// the PLT case of Thunk<E>::copy_buf.
//
// Words 4 and 5 (the addis/addi pair) have zero immediates; each user ORs
// in the @higha/@lo halves of the displacement from (entry address + 8),
// which is the value %r12 holds after the second mflr, to the GOT or
// GOTPLT slot to load from.
static const ub32 plt_entry[] = {
  // Get the address of this PLT entry
  0x7c08'02a6, // mflr    r0
  0x429f'0005, // bcl     20, 31, 4
  0x7d88'02a6, // mflr    r12
  0x7c08'03a6, // mtlr    r0

  // Load an address from the GOT/GOTPLT entry and jump to that address
  0x3d6c'0000, // addis   r11, r12, OFFSET@higha
  0x396b'0000, // addi    r11, r11, OFFSET@lo
  0x818b'0000, // lwz     r12, 0(r11)
  0x7d89'03a6, // mtctr   r12
  0x4e80'0420, // bctr
};
// Writes the PLT entry for `sym` to `buf`. The entry is a copy of
// `plt_entry` whose addis/addi immediates are patched so that it loads
// from sym's GOTPLT slot.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static_assert(E::plt_size == sizeof(plt_entry));

  // %r12 points 8 bytes past the start of the entry when the addis runs,
  // hence the -8.
  i64 disp = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;

  memcpy(buf, plt_entry, sizeof(plt_entry));

  ub32 *words = (ub32 *)buf;
  words[4] |= higha(disp);
  words[5] |= lo(disp);
}
// Writes the PLTGOT entry for `sym` to `buf`. It uses the same template as
// a regular PLT entry but loads the destination from the address returned
// by sym.get_got_pltgot_addr() instead of from the GOTPLT slot.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static_assert(E::pltgot_size == sizeof(plt_entry));

  // The displacement is relative to (entry address + 8), the value held in
  // %r12 when the addis instruction runs.
  i64 disp = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 8;

  memcpy(buf, plt_entry, sizeof(plt_entry));

  ub32 *words = (ub32 *)buf;
  words[4] |= higha(disp);
  words[5] |= lo(disp);
}
// Applies a single relocation to the output .eh_frame section.
//
// `offset` is the position of the relocated field relative to the start of
// this section, and `val` is the value to relocate against. Only 32-bit
// absolute and 32-bit PC-relative relocations are accepted; anything else
// other than R_NONE is reported with Fatal.
template <>
void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                       u64 offset, u64 val) {
  ub32 *field = (ub32 *)(ctx.buf + this->shdr.sh_offset + offset);

  if (rel.r_type == R_NONE)
    return;

  if (rel.r_type == R_PPC_ADDR32) {
    *field = val;
  } else if (rel.r_type == R_PPC_REL32) {
    // Relative to the address of the field itself
    *field = val - this->shdr.sh_addr - offset;
  } else {
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Applies this section's relocations to its copy in the output buffer.
//
// `base` points to the beginning of this section's contents in the output;
// each relocated field lives at base + r_offset. Relocation types that
// this function does not know hit unreachable().
// NOTE(review): presumably scan_relocations has already reported such
// types as errors before we get here -- confirm.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Cursor passed to apply_dyn_absrel. It remains null if there is no
  // ctx.reldyn section.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn) {
    u8 *slot = ctx.buf + ctx.reldyn->shdr.sh_offset + file.reldyn_offset +
               this->reldyn_offset;
    dynrel = (ElfRel<E> *)slot;
  }

  // Address of this file's .got2, or 0 if the file doesn't have one
  u64 GOT2 = 0;
  if (file.extra.got2)
    GOT2 = file.extra.got2->get_addr();

  for (i64 idx = 0; idx < rels.size(); idx++) {
    const ElfRel<E> &rel = rels[idx];
    if (rel.r_type == R_NONE)
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // Writers for the field shapes used below: overwrite a 16-bit field,
    // overwrite a 32-bit word, and OR bits into a 32-bit instruction word.
    auto put16 = [&](u64 v) { *(ub16 *)loc = v; };
    auto put32 = [&](u64 v) { *(ub32 *)loc = v; };
    auto or32 = [&](u64 v) { *(ub32 *)loc |= v; };

    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;

    u64 abs_val = S + A;                // absolute target
    u64 pc_val = S + A - P;             // target relative to the field
    u64 got2_val = G + GOT - A - GOT2;  // value used by the R_PPC_PLT* types

    switch (rel.r_type) {
    // 32-bit absolute; may be turned into a dynamic relocation
    case R_PPC_ADDR32:
    case R_PPC_UADDR32:
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
      break;

    // Absolute addresses
    case R_PPC_ADDR14:
      or32(bits(abs_val, 15, 2) << 2);
      break;
    case R_PPC_ADDR16:
    case R_PPC_UADDR16:
    case R_PPC_ADDR16_LO:
      put16(lo(abs_val));
      break;
    case R_PPC_ADDR16_HI:
      put16(hi(abs_val));
      break;
    case R_PPC_ADDR16_HA:
      put16(ha(abs_val));
      break;
    case R_PPC_ADDR24:
      or32(bits(abs_val, 25, 2) << 2);
      break;
    case R_PPC_ADDR30:
      or32(bits(abs_val, 31, 2) << 2);
      break;

    // GOT slot address relative to (.got2 + addend)
    case R_PPC_PLT16_LO:
      put16(lo(got2_val));
      break;
    case R_PPC_PLT16_HI:
      put16(hi(got2_val));
      break;
    case R_PPC_PLT16_HA:
      put16(ha(got2_val));
      break;
    case R_PPC_PLT32:
      put32(got2_val);
      break;

    // PC-relative addresses
    case R_PPC_REL14:
      or32(bits(pc_val, 15, 2) << 2);
      break;
    case R_PPC_REL16:
    case R_PPC_REL16_LO:
      put16(lo(pc_val));
      break;
    case R_PPC_REL16_HI:
      put16(hi(pc_val));
      break;
    case R_PPC_REL16_HA:
      put16(ha(pc_val));
      break;

    // Branches. If the displacement doesn't survive sign-extension from
    // bit 25, branch to the thunk for this relocation instead.
    case R_PPC_REL24:
    case R_PPC_LOCAL24PC: {
      i64 disp = pc_val;
      if (sign_extend(disp, 25) != disp)
        disp = get_thunk_addr(idx) - P;
      or32(bits(disp, 25, 2) << 2);
      break;
    }
    case R_PPC_PLTREL24: {
      // The addend is not part of the branch target for this type. A
      // symbol that has a PLT is always reached through the thunk.
      i64 disp = S - P;
      if (sym.has_plt(ctx) || sign_extend(disp, 25) != disp)
        disp = get_thunk_addr(idx) - P;
      or32(bits(disp, 25, 2) << 2);
      break;
    }
    case R_PPC_REL32:
    case R_PPC_PLTREL32:
      put32(pc_val);
      break;

    // Offset of the symbol's GOT slot
    case R_PPC_GOT16:
    case R_PPC_GOT16_LO:
      put16(lo(G + A));
      break;
    case R_PPC_GOT16_HI:
      put16(hi(G + A));
      break;
    case R_PPC_GOT16_HA:
      put16(ha(G + A));
      break;

    // Relative to ctx.tp_addr
    case R_PPC_TPREL16_LO:
      put16(lo(abs_val - ctx.tp_addr));
      break;
    case R_PPC_TPREL16_HI:
      put16(hi(abs_val - ctx.tp_addr));
      break;
    case R_PPC_TPREL16_HA:
      put16(ha(abs_val - ctx.tp_addr));
      break;

    // Relative to ctx.dtp_addr
    case R_PPC_DTPREL16_LO:
      put16(lo(abs_val - ctx.dtp_addr));
      break;
    case R_PPC_DTPREL16_HI:
      put16(hi(abs_val - ctx.dtp_addr));
      break;
    case R_PPC_DTPREL16_HA:
      put16(ha(abs_val - ctx.dtp_addr));
      break;

    // TLS-related GOT slots, relative to the beginning of .got
    case R_PPC_GOT_TLSGD16:
      put16(sym.get_tlsgd_addr(ctx) - GOT);
      break;
    case R_PPC_GOT_TLSLD16:
      put16(ctx.got->get_tlsld_addr(ctx) - GOT);
      break;
    case R_PPC_GOT_TPREL16:
      put16(sym.get_gottp_addr(ctx) - GOT);
      break;

    // Nothing is written for these types
    case R_PPC_TLS:
    case R_PPC_TLSGD:
    case R_PPC_TLSLD:
    case R_PPC_PLTSEQ:
    case R_PPC_PLTCALL:
      break;

    default:
      unreachable();
    }
  }
}
// Applies relocations to the output copy of a non-allocated section.
//
// `base` points to the beginning of this section's contents in the output.
// R_PPC_ADDR32 is the only type accepted here (besides R_NONE, which is
// skipped); any other type is reported with Fatal.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (const ElfRel<E> &rel : rels) {
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // If get_fragment returns a section fragment for this relocation, the
    // fragment's address and addend take precedence over the symbol's
    // address and the relocation's own addend.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    if (rel.r_type == R_PPC_ADDR32) {
      // A tombstone value, if get_tombstone provides one, replaces S + A
      std::optional<u64> tombstone = get_tombstone(sym, frag);
      *(ub32 *)loc = tombstone ? *tombstone : S + A;
    } else {
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// Scans this section's relocations and records, per symbol (or on ctx),
// what each relocation needs -- GOT slot, PLT entry, TLS slots -- or hands
// the relocation to the generic scan_* helpers. No section contents are
// written here. Unknown relocation types are reported with Error.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (const ElfRel<E> &rel : rels) {
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];

    // Any reference to an ifunc symbol requests both a GOT slot and a PLT
    // entry, regardless of the relocation type.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;

    switch (rel.r_type) {
    // 32-bit absolute relocations
    case R_PPC_ADDR32:
    case R_PPC_UADDR32:
      scan_dyn_absrel(ctx, sym, rel);
      break;

    // Other absolute relocations
    case R_PPC_ADDR14:
    case R_PPC_ADDR16:
    case R_PPC_UADDR16:
    case R_PPC_ADDR16_LO:
    case R_PPC_ADDR16_HI:
    case R_PPC_ADDR16_HA:
    case R_PPC_ADDR24:
    case R_PPC_ADDR30:
      scan_absrel(ctx, sym, rel);
      break;

    // PC-relative data relocations
    case R_PPC_REL14:
    case R_PPC_REL16:
    case R_PPC_REL16_LO:
    case R_PPC_REL16_HI:
    case R_PPC_REL16_HA:
    case R_PPC_REL32:
      scan_pcrel(ctx, sym, rel);
      break;

    // Relocations that refer to the symbol's GOT slot
    case R_PPC_GOT16:
    case R_PPC_GOT16_LO:
    case R_PPC_GOT16_HI:
    case R_PPC_GOT16_HA:
    case R_PPC_PLT16_LO:
    case R_PPC_PLT16_HI:
    case R_PPC_PLT16_HA:
    case R_PPC_PLT32:
      sym.flags |= NEEDS_GOT;
      break;

    // Calls: a PLT entry is requested only for imported symbols
    case R_PPC_REL24:
    case R_PPC_PLTREL24:
    case R_PPC_PLTREL32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;

    // TLS relocations that need GOT slots
    case R_PPC_GOT_TLSGD16:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_PPC_GOT_TLSLD16:
      ctx.needs_tlsld = true;
      break;
    case R_PPC_GOT_TPREL16:
      sym.flags |= NEEDS_GOTTP;
      break;

    // Thread-pointer-relative relocations
    case R_PPC_TPREL16_LO:
    case R_PPC_TPREL16_HI:
    case R_PPC_TPREL16_HA:
      check_tlsle(ctx, sym, rel);
      break;

    // Nothing to record for these types
    case R_PPC_LOCAL24PC:
    case R_PPC_TLS:
    case R_PPC_TLSGD:
    case R_PPC_TLSLD:
    case R_PPC_DTPREL16_LO:
    case R_PPC_DTPREL16_HI:
    case R_PPC_DTPREL16_HA:
    case R_PPC_PLTSEQ:
    case R_PPC_PLTCALL:
      break;

    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Writes this thunk to the output buffer: one E::thunk_size-byte stub per
// symbol in `symbols`, laid out back to back.
//
// A symbol that has a PLT gets a copy of `plt_entry`, which loads the
// destination from the symbol's GOT slot (if it has one) or GOTPLT slot.
// Any other symbol gets `local_thunk`, which jumps to the symbol's address
// directly.
template <>
void Thunk<E>::copy_buf(Context<E> &ctx) {
  static const ub32 local_thunk[] = {
    // Get this thunk's address
    0x7c08'02a6, // mflr    r0
    0x429f'0005, // bcl     20, 31, 4
    0x7d88'02a6, // mflr    r12
    0x7c08'03a6, // mtlr    r0

    // Materialize the destination's address in %r11 and jump to that address
    0x3d6c'0000, // addis   r11, r12, OFFSET@higha
    0x396b'0000, // addi    r11, r11, OFFSET@lo
    0x7d69'03a6, // mtctr   r11
    0x4e80'0420, // bctr
    0x6000'0000, // nop
  };

  // Both templates are exactly one stub long, so either can be copied
  // with a size of E::thunk_size.
  static_assert(E::thunk_size == sizeof(plt_entry));
  static_assert(E::thunk_size == sizeof(local_thunk));

  u8 *out = ctx.buf + output_section.shdr.sh_offset + offset;
  u64 addr = output_section.shdr.sh_addr + offset;

  for (Symbol<E> *sym : symbols) {
    const ub32 *stub;
    u64 target;

    if (sym->has_plt(ctx)) {
      stub = plt_entry;
      target =
        sym->has_got(ctx) ? sym->get_got_addr(ctx) : sym->get_gotplt_addr(ctx);
    } else {
      stub = local_thunk;
      target = sym->get_addr(ctx);
    }

    // %r12 holds (stub address + 8) when the addis instruction runs
    i64 disp = target - addr - 8;

    memcpy(out, stub, E::thunk_size);
    ub32 *words = (ub32 *)out;
    words[4] |= higha(disp);
    words[5] |= lo(disp);

    out += E::thunk_size;
    addr += E::thunk_size;
  }
}
} // namespace mold::elf