1
1
mirror of https://github.com/rui314/mold.git synced 2024-09-20 01:18:53 +03:00

Implement range extension thunks for LoongArch

This commit is contained in:
Rui Ueyama 2023-08-11 10:11:00 +09:00
parent 9441170d39
commit 3b5ccb9342
6 changed files with 52 additions and 15 deletions

View File

@ -590,7 +590,7 @@ void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size;
u8 *loc = buf + i * E::thunk_size;
memcpy(loc , data, sizeof(data));
memcpy(loc, data, sizeof(data));
write_adrp(loc, page(S) - page(P));
*(ul32 *)(loc + 4) |= bits(S, 11, 0) << 10;
}

View File

@ -11,8 +11,12 @@
// Linux, GCC, LLVM, etc.
//
// All instructions are 4 bytes long in LoongArch and aligned to 4-byte
// boundaries. The psABI defines a few linker relaxations. We haven't
// supported them yet, though.
// boundaries. It has 32 general-purpose registers. Among these, $t0 - $t8
// (aliases for $r12 - $r20) are temporary registers that we can use in
// our PLT and range extension thunks.
//
// The psABI defines a few linker relaxations. We haven't supported them
// yet.
//
// https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html
@ -33,7 +37,7 @@ static u64 hi20(u64 val, u64 pc) {
// register with the following instructions:
//
// pcalau12i $rN, %hi20(sym)
// addi.d $rN, $zero, %lo12(sym)
// addi.d $rN, $rN, %lo12(sym)
//
// PCALAU12I materializes bits [63:12] by computing (pc + (imm << 12))
// and zero-clearing bits [11:0]. ADDI.D sign-extends its 12 bit immediate and
@ -313,10 +317,13 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
check_branch(S + A - P, -(1 << 22), 1 << 22);
write_d5k16(loc, (S + A - P) >> 2);
break;
case R_LARCH_B26:
check_branch(S + A - P, -(1 << 27), 1 << 27);
write_d10k16(loc, (S + A - P) >> 2);
case R_LARCH_B26: {
i64 val = S + A - P;
if (val < -(1 << 27) || (1 << 27) <= val)
val = get_thunk_addr(i) + A - P;
write_d10k16(loc, val >> 2);
break;
}
case R_LARCH_ABS_HI20:
write_j20(loc, (S + A) >> 12);
break;
@ -668,6 +675,30 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
}
}
// Writes every range extension thunk of this thunk group into the
// output file buffer.
//
// Each thunk is a fixed four-instruction sequence: a PCALAU12I/ADDI.D
// pair that builds the address of the thunk's target symbol in $t0,
// followed by an indirect jump through $t0 and a padding NOP.
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
  // Instruction template. The immediates of the first two instructions
  // are patched per thunk below.
  static const ul32 insn[] = {
    0x1a00'000c, // pcalau12i $t0, 0
    0x02c0'018c, // addi.d    $t0, $t0, 0
    0x4c00'0180, // jirl      $zero, $t0, 0
    0x0340'0000, // nop
  };

  static_assert(E::thunk_size == sizeof(insn));

  // Start of this thunk group within the output buffer.
  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;

  for (i64 idx = 0; idx < symbols.size(); idx++) {
    // Location of the idx'th thunk in the buffer and its address at
    // runtime.
    u8 *dst = buf + idx * E::thunk_size;
    u64 pc = output_section.shdr.sh_addr + offset + idx * E::thunk_size;

    // Address the thunk must jump to.
    u64 target = symbols[idx]->get_addr(ctx);

    memcpy(dst, insn, sizeof(insn));

    // Patch the high part into PCALAU12I and the low 12 bits into ADDI.D.
    write_j20(dst, hi20(target, pc) >> 12);
    write_k12(dst + 4, target);
  }
}
} // namespace mold::elf
#endif

View File

@ -2377,6 +2377,8 @@ struct LOONGARCH64 {
static constexpr u32 plt_hdr_size = 32;
static constexpr u32 plt_size = 16;
static constexpr u32 pltgot_size = 16;
static constexpr u32 thunk_hdr_size = 0;
static constexpr u32 thunk_size = 16;
static constexpr u32 R_COPY = R_LARCH_COPY;
static constexpr u32 R_GLOB_DAT = R_LARCH_64;
@ -2400,6 +2402,8 @@ struct LOONGARCH32 {
static constexpr u32 plt_hdr_size = 32;
static constexpr u32 plt_size = 16;
static constexpr u32 pltgot_size = 16;
static constexpr u32 thunk_hdr_size = 0;
static constexpr u32 thunk_size = 16;
static constexpr u32 R_COPY = R_LARCH_COPY;
static constexpr u32 R_GLOB_DAT = R_LARCH_32;

View File

@ -114,7 +114,7 @@ public:
idx * E::thunk_size;
}
static constexpr i64 alignment = 4;
static constexpr i64 alignment = 16;
OutputSection<E> &output_section;
i64 offset;

View File

@ -20,7 +20,8 @@
// we don't need to try too hard to reduce thunk size to the absolute
// minimum.
#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2
#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 || \
MOLD_LOONGARCH64 || MOLD_LOONGARCH32
#include "mold.h"
@ -37,7 +38,9 @@ static consteval i64 max_distance() {
// and therefore the least two bits are always zero. So the branch
// operand is effectively 28 bits long. That means the branch range is
// [-2^27, 2^27) or PC ± 128 MiB.
if (is_arm64<E>)
//
// LoongArch's B and BL instructions also take a 26 bit immediate.
if (is_arm64<E> || is_loongarch<E>)
return 1 << 27;
// ARM32's Thumb branch has 24 bits immediate, and the instructions are
@ -76,9 +79,11 @@ static bool needs_thunk_rel(const ElfRel<E> &r) {
ty == R_ARM_PLT32;
} else if constexpr (is_ppc32<E>) {
return ty == R_PPC_REL24 || ty == R_PPC_PLTREL24 || ty == R_PPC_LOCAL24PC;
} else {
static_assert(is_ppc64<E>);
} else if constexpr (is_ppc64<E>) {
return ty == R_PPC64_REL24 || ty == R_PPC64_REL24_NOTOC;
} else {
static_assert(is_loongarch<E>);
return ty == R_LARCH_B26;
}
}

View File

@ -8,9 +8,6 @@
# It looks like SPARC's runtime can't handle PLT if it's too far from GOT.
[ $MACHINE = sparc64 ] && skip
# The crt*.o compiled with B26 caused it to be too far from GOT.
[[ $MACHINE = loongarch* ]] && skip
cat <<EOF > $t/a.c
#include <stdio.h>