1
1
mirror of https://github.com/rui314/mold.git synced 2024-09-20 17:39:56 +03:00
mold/elf/arch-arm32.cc
2022-05-24 15:52:33 +08:00

596 lines
18 KiB
C++

// ARM32 is a bit special from the linker's viewpoint because ARM
// processors support two different instruction encodings: Thumb and
// ARM (in a narrower sense). Thumb instructions are either 16 bits or
// 32 bits, while ARM instructions are all 32 bits. Feature-wise,
// thumb is a subset of ARM, so not all ARM instructions are
// representable in Thumb.
//
// ARM processors originally supported only ARM instructions. Thumb
// instructions were later added to increase code density.
//
// ARM processors run in either ARM mode or Thumb mode. The mode can
// be switched using BX (branch and mode exchange)-family instructions.
// We need to use such instructions to, for example, call a function
// encoded in Thumb from a function encoded in ARM. Sometimes, the
// linker even has to emit an interworking thunk code to switch from
// Thumb to ARM.
//
// ARM instructions are aligned to 4 byte boundaries. Thumb are to 2
// byte boundaries.
//
// You can distinguish Thumb functions from ARM functions by looking
// at the least significant bit (LSB) of its "address". If LSB is 0,
// it's ARM; otherwise, Thumb. LSB is not a part of its real address.
// For example, if a symbol `foo` is of type STT_FUNC and has value
// 0x2001, then `foo` is a function using Thumb instructions whose
// address is 0x2000 (not 0x2001).
#include "mold.h"
#include <tbb/parallel_for.h>
#include <tbb/parallel_for_each.h>
#include <tbb/parallel_sort.h>
namespace mold::elf {
using E = ARM32;
// Patch the 16-bit immediate of an ARM MOVW/MOVT instruction at `loc`.
// The immediate is split into imm4 (instruction bits [19:16]) and
// imm12 (instruction bits [11:0]); all other bits are preserved.
static void write_mov_imm(u8 *loc, u32 val) {
  u32 hi4 = bits(val, 15, 12);
  u32 lo12 = bits(val, 11, 0);
  u32 insn = *(ul32 *)loc;
  insn = (insn & 0xfff0f000) | (hi4 << 16) | lo12;
  *(ul32 *)loc = insn;
}
// Writes a branch displacement `val` into a Thumb-2 BL/BLX instruction
// pair at `loc`. The displacement is scattered across both 16-bit
// halfwords as the S, imm10, J1, J2 and imm11 fields.
static void write_thm_b_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/BL--BLX--immediate-
u32 sign = bit(val, 24);
u32 I1 = bit(val, 23);
u32 I2 = bit(val, 22);
// Per the manual, I1 = NOT(J1 EOR S) and I2 = NOT(J2 EOR S); invert
// that to get the J bits that are actually stored in the instruction.
u32 J1 = !I1 ^ sign;
u32 J2 = !I2 ^ sign;
u32 imm10 = bits(val, 21, 12);
u32 imm11 = bits(val, 11, 1);
// First halfword: keep the opcode bits (mask 0xf800), insert S and imm10.
*(ul16 *)loc = (*(ul16 *)loc & 0xf800) | (sign << 10) | imm10;
// Second halfword: keep the opcode bits (mask 0xd000), insert J1/J2/imm11.
*(ul16 *)(loc + 2) =
(*(ul16 *)(loc + 2) & 0xd000) | (J1 << 13) | (J2 << 11) | imm11;
}
// Writes a 16-bit immediate `val` into a Thumb-2 MOVW/MOVT instruction
// at `loc`. The immediate is split into the imm4, i, imm3 and imm8
// fields scattered across the two halfwords.
static void write_thm_mov_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/MOVT
u32 imm4 = bits(val, 15, 12);
u32 i = bit(val, 11);
u32 imm3 = bits(val, 10, 8);
u32 imm8 = bits(val, 7, 0);
// First halfword: clear the i (bit 10) and imm4 (bits 3:0) fields.
*(ul16 *)loc = (*(ul16 *)loc & 0b1111'1011'1111'0000) | (i << 10) | imm4;
// Second halfword: clear the imm3 (bits 14:12) and imm8 (bits 7:0) fields.
*(ul16 *)(loc + 2) =
((*(ul16 *)(loc + 2)) & 0b1000'1111'0000'0000) | (imm3 << 12) | imm8;
}
// Writes the .plt section: one shared header (plt0) followed by one
// fixed-size entry per PLT symbol. Each entry loads the address stored
// in the symbol's .got.plt slot and jumps to it.
//
// NOTE(review): the instruction templates below are host-byte-order
// u32 arrays copied verbatim into the output; this assumes a
// little-endian host -- confirm that's an accepted constraint.
template <>
void PltSection<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + this->shdr.sh_offset;
static const u32 plt0[] = {
0xe52de004, // push {lr}
0xe59fe004, // ldr lr, 2f
0xe08fe00e, // 1: add lr, pc, lr
0xe5bef008, // ldr pc, [lr, #8]!
0x00000000, // 2: .word .got.plt - 1b - 8
0xe320f000, // nop
0xe320f000, // nop
0xe320f000, // nop
};
memcpy(buf, plt0, sizeof(plt0));
// Fill in the word at label 2 (offset 16). The `add` at label 1 sits
// at offset 8, where ARM's pc reads 8 bytes ahead (= sh_addr + 16),
// so subtracting 16 makes lr end up pointing exactly at .got.plt.
*(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - this->shdr.sh_addr - 16;
for (Symbol<E> *sym : symbols) {
static const u32 plt[] = {
0xe59fc004, // 1: ldr ip, 2f
0xe08cc00f, // add ip, ip, pc
0xe59cf000, // ldr pc, [ip]
0x00000000, // 2: .word sym@PLTGOT - 1b
};
u8 *ent = buf + sizeof(plt0) + sym->get_plt_idx(ctx) * sizeof(plt);
memcpy(ent, plt, sizeof(plt));
// The word at offset 12 is added to pc at the `add` (offset 4, where
// pc reads entry + 12), so ip becomes the symbol's .got.plt address.
*(ul32 *)(ent + 12) = sym->get_gotplt_addr(ctx) - sym->get_plt_addr(ctx) - 12;
}
}
// Writes .plt.got entries. These are like regular PLT entries, but the
// target address is loaded from the symbol's .got slot (get_got_addr)
// instead of a .got.plt slot.
template <>
void PltGotSection<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + this->shdr.sh_offset;
for (Symbol<E> *sym : symbols) {
static const u32 plt[] = {
0xe59fc004, // 1: ldr ip, 2f
0xe08cc00f, // add ip, ip, pc
0xe59cf000, // ldr pc, [ip]
0x00000000, // 2: .word sym@GOT - 1b
};
u8 *ent = buf + sym->get_pltgot_idx(ctx) * sizeof(plt);
memcpy(ent, plt, sizeof(plt));
// The word at offset 12 is added to pc at the `add` (offset 4, where
// pc reads entry + 12), so ip becomes the symbol's GOT entry address.
*(ul32 *)(ent + 12) = sym->get_got_addr(ctx) - sym->get_plt_addr(ctx) - 12;
}
}
// ARM does not use .eh_frame for exception handling. Instead, it uses
// the .ARM.exidx and .ARM.extab sections, so this function is
// intentionally a no-op.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {}
// Applies relocations to an input section that is part of the mapped
// image (SHF_ALLOC). `base` points to where this section's contents
// have already been copied into the output buffer. Dynamic relocations,
// when needed, are appended to .rel.dyn through `dynrel`.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
ElfRel<E> *dynrel = nullptr;
std::span<const ElfRel<E>> rels = get_rels(ctx);
i64 frag_idx = 0;
// Position in .rel.dyn reserved for this section's dynamic relocations.
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_ARM_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
// If this relocation targets a section fragment, use the fragment's
// address and addend instead of the symbol's. rel_fragments entries
// are matched to relocations by index.
const SectionFragmentRef<E> *frag_ref = nullptr;
if (rel_fragments && rel_fragments[frag_idx].idx == i)
frag_ref = &rel_fragments[frag_idx++];
// Standard ELF relocation pseudo-variables:
//   S   - address of the symbol (or fragment)
//   A   - addend
//   P   - address of the place being relocated
//   T   - 1 if the symbol is a Thumb function (the LSB of a function
//         address encodes the instruction set on ARM)
//   G   - offset of the symbol's GOT slot from the start of .got
//   GOT - address of .got
#define S (frag_ref ? frag_ref->frag->get_addr(ctx) : sym.get_addr(ctx))
#define A (frag_ref ? frag_ref->addend : this->get_addend(rel))
#define P (output_section->shdr.sh_addr + offset + rel.r_offset)
#define T (sym.get_addr(ctx) & 1)
#define G (sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr)
#define GOT ctx.got->shdr.sh_addr
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_TARGET1:
if (sym.is_absolute() || !ctx.arg.pic) {
*(ul32 *)loc = S + A;
} else if (sym.is_imported) {
// Let the dynamic linker resolve the address at load time.
*dynrel++ = {P, R_ARM_ABS32, (u32)sym.get_dynsym_idx(ctx)};
} else {
// A local symbol in a PIC image needs a base relocation unless
// the slot is covered by a packed .relr section.
if (!is_relr_reloc(ctx, rel))
*dynrel++ = {P, R_ARM_RELATIVE, 0};
*(ul32 *)loc = S + A;
}
continue;
case R_ARM_REL32:
*(ul32 *)loc = S + A - P;
continue;
case R_ARM_THM_CALL:
// THM_CALL relocation refers either BL or BLX instruction.
// They are different in only one bit. We need to use BL if
// the jump target is Thumb. Otherwise, use BLX.
if (sym.esym().is_undef_weak()) {
// On ARM, calling a weak undefined symbol jumps to the
// next instruction.
write_thm_b_imm(loc, 4);
*(ul16 *)(loc + 2) |= (1 << 12); // rewrite with BL
} else if (T) {
write_thm_b_imm(loc, S + A - P);
*(ul16 *)(loc + 2) |= (1 << 12); // rewrite with BL
} else {
// The destination is ARM code: round the displacement to a
// multiple of 4 and switch the instruction set with BLX.
write_thm_b_imm(loc, align_to(S + A - P, 4));
*(ul16 *)(loc + 2) &= ~(1 << 12); // rewrite with BLX
}
continue;
case R_ARM_BASE_PREL:
*(ul32 *)loc = GOT + A - P;
continue;
case R_ARM_GOT_PREL:
*(ul32 *)loc = G + A - P;
continue;
case R_ARM_GOT_BREL:
*(ul32 *)loc = G + A;
continue;
case R_ARM_TARGET2:
// GOT + G is the absolute address of the symbol's GOT slot, so
// this resolves to a pc-relative reference to the GOT entry.
*(ul32 *)loc = GOT + G + A - P;
continue;
case R_ARM_CALL:
case R_ARM_JUMP24: {
u32 val;
if (sym.esym().is_undef_weak()) {
// On ARM, calling a weak undefined symbol jumps to the
// next instruction.
val = 4;
} else {
val = S + A - P;
}
// Insert the word-aligned displacement into the low 24 bits of
// the B/BL instruction.
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | ((val >> 2) & 0x00ff'ffff);
continue;
}
case R_ARM_THM_JUMP11: {
// A short Thumb branch can only target Thumb code.
assert(T);
u32 val = (S + A - P) >> 1;
*(ul16 *)loc = (*(ul16 *)loc & 0xf800) | (val & 0x07ff);
continue;
}
case R_ARM_THM_JUMP24:
// A Thumb B instruction cannot change the instruction set, so if
// the destination is ARM code we branch to an interworking thunk
// instead (see ThumbToArmSection below).
if (T) {
write_thm_b_imm(loc, S + A - P);
} else {
assert(sym.extra.thumb_to_arm_thunk_idx != -1);
u64 thunk_addr =
ctx.thumb_to_arm->shdr.sh_addr +
sym.extra.thumb_to_arm_thunk_idx * ThumbToArmSection::ENTRY_SIZE;
// The -4 compensates for the Thumb pc bias (pc reads P + 4).
write_thm_b_imm(loc, thunk_addr - P - 4);
}
continue;
case R_ARM_MOVW_PREL_NC:
// `| T` keeps the Thumb bit in the materialized address.
write_mov_imm(loc, ((S + A) | T) - P);
continue;
case R_ARM_MOVW_ABS_NC:
write_mov_imm(loc, (S + A) | T);
continue;
case R_ARM_THM_MOVW_PREL_NC:
write_thm_mov_imm(loc, ((S + A) | T) - P);
continue;
case R_ARM_PREL31: {
// 31-bit pc-relative offset; the top bit of the existing word is
// preserved.
u32 val = S + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff);
continue;
}
case R_ARM_THM_MOVW_ABS_NC:
write_thm_mov_imm(loc, (S + A) | T);
continue;
case R_ARM_MOVT_PREL:
// The MOVT variants take the upper 16 bits of the value.
write_mov_imm(loc, (S + A - P) >> 16);
continue;
case R_ARM_THM_MOVT_PREL:
write_thm_mov_imm(loc, (S + A - P) >> 16);
continue;
case R_ARM_MOVT_ABS:
write_mov_imm(loc, (S + A) >> 16);
continue;
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, (S + A) >> 16);
continue;
case R_ARM_TLS_GD32:
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - P;
continue;
case R_ARM_TLS_LDM32:
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - P;
continue;
case R_ARM_TLS_LDO32:
*(ul32 *)loc = S + A - ctx.tls_begin;
continue;
case R_ARM_TLS_IE32:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - P;
continue;
case R_ARM_TLS_LE32:
// NOTE(review): the +8 presumably skips ARM's 8-byte TCB at the
// start of the TLS block (TLS variant 1) -- confirm against the
// ARM TLS ABI.
*(ul32 *)loc = S + A - ctx.tls_begin + 8;
continue;
case R_ARM_TLS_GOTDESC:
// A tlsdesc index of -1 means the TLSDESC access has been relaxed
// to a static offset from the start of the TLS block.
if (sym.get_tlsdesc_idx(ctx) == -1)
*(ul32 *)loc = S - ctx.tls_begin + 8;
else
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - P - 6;
continue;
case R_ARM_THM_TLS_CALL:
if (sym.get_tlsdesc_idx(ctx) == -1) {
// BL -> NOP
// (0xf3af 0x8000 is a 32-bit Thumb-2 NOP; the low halfword of
// this little-endian ul32 store is the first halfword.)
*(ul32 *)loc = 0x8000f3af;
} else {
u64 addr = ctx.tls_trampoline->shdr.sh_addr;
write_thm_b_imm(loc, align_to(addr - P - 4, 4));
*(ul16 *)(loc + 2) &= ~(1 << 12); // rewrite BL with BLX
}
continue;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
#undef S
#undef A
#undef P
#undef T
#undef G
#undef GOT
}
}
// Applies relocations to a non-allocated section (one that is written
// to the output file but not mapped at runtime, e.g. debug info).
// Only plain data relocations are meaningful here; anything else is a
// fatal error.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_ARM_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
if (!sym.file) {
report_undef(ctx, file, sym);
continue;
}
// If this relocation targets a section fragment, use its address and
// addend instead of the symbol's.
SectionFragment<E> *frag;
i64 addend;
std::tie(frag, addend) = get_fragment(ctx, rel);
#define S (frag ? frag->get_addr(ctx) : sym.get_addr(ctx))
#define A (frag ? addend : this->get_addend(rel))
switch (rel.r_type) {
case R_ARM_ABS32:
// A tombstone value substitutes for references to symbols whose
// sections have been discarded (see get_tombstone).
if (!frag) {
if (std::optional<u64> val = get_tombstone(sym)) {
*(ul32 *)loc = *val;
break;
}
}
*(ul32 *)loc = S + A;
break;
case R_ARM_TLS_LDO32:
if (std::optional<u64> val = get_tombstone(sym))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A - ctx.tls_begin;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
#undef S
#undef A
}
}
// Scans this section's relocations to decide, before layout, what each
// symbol needs: GOT/PLT entries, dynamic relocations, TLS structures or
// Thumb-to-ARM interworking thunks. No section contents are modified
// here; values are filled in later by apply_reloc_alloc.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_ARM_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (!sym.file) {
report_undef(ctx, file, sym);
continue;
}
// An ifunc is always resolved through a PLT entry backed by a GOT
// slot, regardless of the relocation type.
if (sym.get_type() == STT_GNU_IFUNC) {
sym.flags |= NEEDS_GOT;
sym.flags |= NEEDS_PLT;
}
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_MOVT_ABS:
case R_ARM_THM_MOVT_ABS:
case R_ARM_TARGET1: {
// Absolute relocations need dynamic relocations (or copy
// relocations / canonical PLT entries) when the output image is
// position-independent or the symbol is imported.
Action table[][4] = {
// Absolute Local Imported data Imported code
{ NONE, BASEREL, DYNREL, DYNREL }, // DSO
{ NONE, BASEREL, DYNREL, DYNREL }, // PIE
{ NONE, NONE, COPYREL, CPLT }, // PDE
};
dispatch(ctx, table, i, rel, sym);
break;
}
case R_ARM_REL32:
case R_ARM_BASE_PREL:
break;
case R_ARM_THM_CALL: {
// Calls to imported symbols always go through the PLT.
Action table[][4] = {
// Absolute Local Imported data Imported code
{ NONE, NONE, PLT, PLT }, // DSO
{ NONE, NONE, PLT, PLT }, // PIE
{ NONE, NONE, PLT, PLT }, // PDE
};
dispatch(ctx, table, i, rel, sym);
break;
}
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TARGET2:
sym.flags |= NEEDS_GOT;
break;
case R_ARM_CALL:
case R_ARM_JUMP24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_ARM_THM_JUMP24:
// A Thumb B instruction cannot switch to ARM mode, so request an
// interworking thunk when the destination is (or may be) ARM.
// An even st_value means the target's LSB is 0, i.e. ARM code.
if (sym.is_imported || sym.get_type() == STT_GNU_IFUNC)
sym.flags |= NEEDS_PLT | NEEDS_THUMB_TO_ARM_THUNK;
else if (sym.esym().st_value % 2 == 0)
sym.flags |= NEEDS_THUMB_TO_ARM_THUNK;
break;
case R_ARM_MOVT_PREL:
case R_ARM_THM_MOVT_PREL:
case R_ARM_PREL31: {
// pc-relative relocations are link-time constants except when the
// symbol may be resolved to another module.
Action table[][4] = {
// Absolute Local Imported data Imported code
{ ERROR, NONE, ERROR, ERROR }, // DSO
{ ERROR, NONE, COPYREL, PLT }, // PIE
{ NONE, NONE, COPYREL, PLT }, // PDE
};
dispatch(ctx, table, i, rel, sym);
break;
}
case R_ARM_TLS_GD32:
sym.flags |= NEEDS_TLSGD;
break;
case R_ARM_TLS_LDM32:
ctx.needs_tlsld = true;
break;
case R_ARM_TLS_IE32:
sym.flags |= NEEDS_GOTTP;
break;
case R_ARM_TLS_GOTDESC:
// TLSDESC accesses can be relaxed away for local symbols unless
// relaxation is disabled.
if (!ctx.relax_tlsdesc || sym.is_imported)
sym.flags |= NEEDS_TLSDESC;
break;
case R_ARM_THM_JUMP11:
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_LE32:
case R_ARM_THM_TLS_CALL:
// Resolved entirely at link time; nothing extra to request.
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
// If a function referenced by a Thumb B (branch) instruction is
// resolved to a non-Thumb function, we can't directly jump from the
// thumb function to the ARM function. We can't rewrite B with BX
// because there's no such BX instruction that takes an immediate.
// BX takes only a register.
//
// In order to support such branch, we insert a small piece of code to
// the resulting executable which switches the processor mode from
// Thumb to ARM. This section contains such code.
// Registers `sym` as needing an interworking thunk, assigning it the
// next sequential slot in this section. Registering the same symbol
// twice is a no-op.
void ThumbToArmSection::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
  if (sym->extra.thumb_to_arm_thunk_idx != -1)
    return;
  sym->extra.thumb_to_arm_thunk_idx = symbols.size();
  symbols.push_back(sym);
}
// Sets the section size: one fixed-size thunk per registered symbol.
void ThumbToArmSection::update_shdr(Context<E> &ctx) {
this->shdr.sh_size = symbols.size() * ENTRY_SIZE;
}
// Writes one thunk per registered symbol. Each thunk materializes the
// pc-relative distance to the destination in ip with a movw/movt pair,
// adds pc, and branches with `bx ip`, which switches to ARM mode
// because the destination address has its LSB clear.
void ThumbToArmSection::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + this->shdr.sh_offset;
i64 offset = 0;
static u8 insn[] = {
0x40, 0xf2, 0x00, 0x0c, // movw ip, $0
0xc0, 0xf2, 0x00, 0x0c, // movt ip, $0
0xfc, 0x44, // add ip, pc
0x60, 0x47, // bx ip
};
static_assert(sizeof(insn) == ENTRY_SIZE);
for (Symbol<E> *sym : symbols) {
memcpy(buf + offset, insn, sizeof(insn));
// The displacement is relative to pc as read by `add ip, pc`: that
// instruction is at thunk offset 8, and Thumb reads pc as the
// instruction address + 4, hence the -12.
u32 val = sym->get_addr(ctx) - this->shdr.sh_addr - offset - 12;
// movw gets the low 16 bits, movt the high 16 bits.
write_thm_mov_imm(buf + offset, val);
write_thm_mov_imm(buf + offset + 4, val >> 16);
offset += sizeof(insn);
}
}
// Writes the TLSDESC trampoline: it adds lr to r0, loads the second
// word of the addressed pair into r1, and branches to it.
// NOTE(review): the instruction words are host-byte-order u32s copied
// verbatim; this assumes a little-endian host -- confirm.
void TlsTrampolineSection::copy_buf(Context<E> &ctx) {
  // Trampoline code for TLSDESC
  static const u32 code[] = {
    0xe08e0000, // add r0, lr, r0
    0xe5901004, // ldr r1, [r0, #4]
    0xe12fff11, // bx r1
  };
  u8 *buf = ctx.buf + this->shdr.sh_offset;
  memcpy(buf, code, sizeof(code));
}
// ARM executables use an .ARM.exidx section to look up an exception
// handling record for the current instruction pointer. The table needs
// to be sorted by their addresses.
//
// Other targets use .eh_frame_hdr instead for the same purpose.
// I don't know why only ARM uses the different mechanism, but it's
// likely that it's due to some historical reason.
//
// This function sorts .ARM.exidx records.
// Sorts .ARM.exidx records by the address of the function each record
// covers, rewriting the relative addresses to stay correct after the
// records move.
void sort_arm_exidx(Context<E> &ctx) {
Timer t(ctx, "sort_arm_exidx");
// Locate the output .ARM.exidx section, if any.
auto find_exidx = [&]() -> OutputSection<E> * {
for (std::unique_ptr<OutputSection<E>> &osec : ctx.output_sections)
if (osec->shdr.sh_type == SHT_ARM_EXIDX)
return osec.get();
return nullptr;
};
OutputSection<E> *osec = find_exidx();
if (!osec)
return;
// .ARM.exidx records consist of a signed 31-bit relative address
// and a 32-bit value. The relative address indicates the start
// address of a function that the record covers. The value is one of
// the followings:
//
// 1. CANTUNWIND indicating that there's no unwinding info for the function,
// 2. a compact unwinding record encoded into a 32-bit value, or
// 3. a 31-bit relative address which points to a larger record in
// the .ARM.extab section.
//
// CANTUNWIND is value 1. The most significant bit is set in (2) but
// not in (3). So they can be distinguished just by looking at a value.
struct Entry {
ul32 addr;
ul32 val;
};
if (osec->shdr.sh_size % sizeof(Entry))
Fatal(ctx) << "invalid .ARM.exidx section size";
Entry *begin = (Entry *)(ctx.buf + osec->shdr.sh_offset);
Entry *end = (Entry *)(ctx.buf + osec->shdr.sh_offset + osec->shdr.sh_size);
// Entry plus the index of the record in the original table.
struct Entry2 {
u32 addr;
u32 val;
u32 idx;
};
// Read section contents
std::vector<Entry2> vec;
vec.reserve(end - begin);
for (Entry *it = begin; it < end; it++)
vec.push_back({it->addr, it->val, (u32)(it - begin)});
// Sort the records. `addr` is relative to the record's own location
// (idx * sizeof(Entry) from the section start), so adding that offset
// yields a section-relative function address to sort by.
tbb::parallel_sort(vec.begin(), vec.end(), [](const Entry2 &a, const Entry2 &b) {
return sign_extend(a.addr, 30) + a.idx * sizeof(Entry) <
sign_extend(b.addr, 30) + b.idx * sizeof(Entry);
});
// Write back the sorted records while adjusting relative addresses:
// a record that moved from slot `idx` to slot `i` must have its
// relative address shifted by the distance it moved.
for (i64 i = 0; i < vec.size(); i++) {
u32 offset = (vec[i].idx - i) * sizeof(Entry);
begin[i].addr = 0x7fff'ffff & (sign_extend(vec[i].addr, 30) + offset);
// Only form (3) -- an .ARM.extab reference -- is itself relative and
// needs the same adjustment; CANTUNWIND and compact records are
// copied through unchanged.
const u32 EXIDX_CANTUNWIND = 1;
if (vec[i].val == EXIDX_CANTUNWIND || (vec[i].val & 0x8000'0000))
begin[i].val = vec[i].val;
else
begin[i].val = 0x7fff'ffff & (sign_extend(vec[i].val, 30) + offset);
}
}
} // namespace mold::elf