Implement range extension thunks for LoongArch

2024-09-20 01:18:53 +03:00 · 2023-08-11 10:11:00 +09:00 · 2023-08-11 10:11:00 +09:00 · 3b5ccb9342
commit 3b5ccb9342
parent 9441170d39
6 changed files with 52 additions and 15 deletions
--- a/elf/arch-arm64.cc
+++ b/elf/arch-arm64.cc
@ -590,7 +590,7 @@ void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
    u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size;

    u8 *loc = buf + i * E::thunk_size;
-    memcpy(loc , data, sizeof(data));
+    memcpy(loc, data, sizeof(data));
    write_adrp(loc, page(S) - page(P));
    *(ul32 *)(loc + 4) |= bits(S, 11, 0) << 10;
  }
--- a/elf/arch-loongarch.cc
+++ b/elf/arch-loongarch.cc
@ -11,8 +11,12 @@
 // Linux, GCC, LLVM, etc.
 //
 // All instructions are 4 bytes long in LoongArch and aligned to 4-byte
-// boundaries. The psABI defines a few linker relaxations. We haven't
-// supported them yet, though.
+// boundaries. It has 32 general-purpose registers. Among these, $t0 - $t8
+// (aliases for $r12 - $r20) are temporary registers that we can use in
+// our PLT and range extension thunks.
+//
+// The psABI defines a few linker relaxations. We haven't supported them
+// yet.
 //
 // https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html

@ -33,7 +37,7 @@ static u64 hi20(u64 val, u64 pc) {
  // register with the following instructions:
  //
  //   pcalau12i $rN, %hi20(sym)
-  //   addi.d    $rN, $zero, %lo12(sym)
+  //   addi.d    $rN, $rN, %lo12(sym)
  //
  // PCALAU12I materializes bits [63:12] by computing (pc + imm << 12)
  // and zero-clear [11:0]. ADDI.D sign-extends its 12 bit immediate and
@ -313,10 +317,13 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
      check_branch(S + A - P, -(1 << 22), 1 << 22);
      write_d5k16(loc, (S + A - P) >> 2);
      break;
-    case R_LARCH_B26:
-      check_branch(S + A - P, -(1 << 27), 1 << 27);
-      write_d10k16(loc, (S + A - P) >> 2);
+    case R_LARCH_B26: {
+      i64 val = S + A - P;
+      if (val < -(1 << 27) || (1 << 27) <= val)
+        val = get_thunk_addr(i) + A - P;
+      write_d10k16(loc, val >> 2);
      break;
+    }
    case R_LARCH_ABS_HI20:
      write_j20(loc, (S + A) >> 12);
      break;
@ -668,6 +675,30 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
  }
 }

+template <> // LoongArch thunk writer: emits one thunk per entry of `symbols`.
+void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
+  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; // where this thunk group's bytes go in ctx.buf
+  // Thunk template; the zero immediates of the first two insns are patched per symbol below.
+  static const ul32 insn[] = {
+    0x1a00'000c, // pcalau12i $t0, 0
+    0x02c0'018c, // addi.d    $t0, $t0, 0
+    0x4c00'0180, // jirl      $zero, $t0, 0
+    0x0340'0000, // nop
+  };
+  // The template must occupy exactly one thunk slot, since slots are indexed by E::thunk_size.
+  static_assert(E::thunk_size == sizeof(insn));
+  // Thunk i transfers control to symbols[i].
+  for (i64 i = 0; i < symbols.size(); i++) {
+    u64 S = symbols[i]->get_addr(ctx); // address the thunk jumps to
+    u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size; // address of thunk i itself
+    // Copy the template, then patch the pcalau12i/addi.d pair so that $t0 ends up holding S.
+    u8 *loc = buf + i * E::thunk_size;
+    memcpy(loc, insn, sizeof(insn));
+    write_j20(loc, hi20(S, P) >> 12); // pcalau12i immediate: PC-relative upper part from hi20()
+    write_k12(loc + 4, S); // addi.d immediate; presumably the low 12 bits of S -- confirm in write_k12
+  }
+}
+
 } // namespace mold::elf

 #endif
--- a/elf/elf.h
+++ b/elf/elf.h
@ -2377,6 +2377,8 @@ struct LOONGARCH64 {
  static constexpr u32 plt_hdr_size = 32;
  static constexpr u32 plt_size = 16;
  static constexpr u32 pltgot_size = 16;
+  static constexpr u32 thunk_hdr_size = 0;
+  static constexpr u32 thunk_size = 16;

  static constexpr u32 R_COPY = R_LARCH_COPY;
  static constexpr u32 R_GLOB_DAT = R_LARCH_64;
@ -2400,6 +2402,8 @@ struct LOONGARCH32 {
  static constexpr u32 plt_hdr_size = 32;
  static constexpr u32 plt_size = 16;
  static constexpr u32 pltgot_size = 16;
+  static constexpr u32 thunk_hdr_size = 0;
+  static constexpr u32 thunk_size = 16;

  static constexpr u32 R_COPY = R_LARCH_COPY;
  static constexpr u32 R_GLOB_DAT = R_LARCH_32;
--- a/elf/mold.h
+++ b/elf/mold.h
@ -114,7 +114,7 @@ public:
           idx * E::thunk_size;
  }

-  static constexpr i64 alignment = 4;
+  static constexpr i64 alignment = 16;

  OutputSection<E> &output_section;
  i64 offset;
--- a/elf/thunks.cc
+++ b/elf/thunks.cc
@ -20,7 +20,8 @@
 // we don't need to try too hard to reduce thunk size to the absolute
 // minimum.

-#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2
+#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 || \
+    MOLD_LOONGARCH64 || MOLD_LOONGARCH32

 #include "mold.h"

@ -37,7 +38,9 @@ static consteval i64 max_distance() {
  // and therefore the least two bits are always zero. So the branch
  // operand is effectively 28 bits long. That means the branch range is
  // [-2^27, 2^27) or PC ± 128 MiB.
-  if (is_arm64<E>)
+  //
+  // LoongArch's B and BL instructions also take a 26-bit immediate.
+  if (is_arm64<E> || is_loongarch<E>)
    return 1 << 27;

  // ARM32's Thumb branch has 24 bits immediate, and the instructions are
@ -76,9 +79,11 @@ static bool needs_thunk_rel(const ElfRel<E> &r) {
           ty == R_ARM_PLT32;
  } else if constexpr (is_ppc32<E>) {
    return ty == R_PPC_REL24  || ty == R_PPC_PLTREL24 || ty == R_PPC_LOCAL24PC;
-  } else {
-    static_assert(is_ppc64<E>);
+  } else if constexpr (is_ppc64<E>) {
    return ty == R_PPC64_REL24 || ty == R_PPC64_REL24_NOTOC;
+  } else {
+    static_assert(is_loongarch<E>);
+    return ty == R_LARCH_B26;
  }
 }

--- a/test/elf/range-extension-thunk.sh
+++ b/test/elf/range-extension-thunk.sh
@ -8,9 +8,6 @@
 # It looks like SPARC's runtime can't handle PLT if it's too far from GOT.
 [ $MACHINE = sparc64 ] && skip

-# The crt*.o compiled with B26 caused far form GOT.
-[[ $MACHINE = loongarch* ]] && skip
-
 cat <<EOF > $t/a.c
 #include <stdio.h>