// ARM32 is a bit special from the linker's viewpoint because ARM
// processors support two different instruction encodings: Thumb and
// ARM (in a narrower sense). Thumb instructions are either 16 bits or
// 32 bits, while ARM instructions are all 32 bits. Feature-wise,
// Thumb is a subset of ARM, so not all ARM instructions are
// representable in Thumb.
//
// ARM processors originally supported only ARM instructions. Thumb
// instructions were later added to increase code density.
//
// An ARM processor runs in either ARM mode or Thumb mode. The mode
// can be switched using BX (branch and exchange)-family instructions.
// We need to use such an instruction to, for example, call a function
// encoded in Thumb from a function encoded in ARM. Sometimes, the
// linker even has to emit an interworking thunk to switch modes.
//
// ARM instructions are aligned to 4-byte boundaries; Thumb
// instructions, to 2-byte boundaries. So the least significant bit of
// a function address is always 0.
//
// To distinguish Thumb functions from ARM functions, the LSB of a
// function address is repurposed as a boolean flag. If the LSB is 0,
// the function referred to by the address is encoded in ARM;
// otherwise, Thumb.
//
// For example, if a symbol `foo` is of type STT_FUNC and has value
// 0x2001, `foo` is a function using Thumb instructions whose address
// is 0x2000 (not 0x2001, as Thumb instructions are always 2-byte
// aligned). Likewise, if a function pointer has value 0x2001, it
// refers to a Thumb function at 0x2000.
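//
// For example, the following hypothetical helpers (not part of this
// file) illustrate the convention:
//
//   static bool is_thumb_fn(u32 addr) { return addr & 1; }
//   static u32 fn_entry(u32 addr) { return addr & ~(u32)1; }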
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf32/aaelf32.rst
#include "mold.h"
#include <tbb/parallel_for.h>
#include <tbb/parallel_for_each.h>
#include <tbb/parallel_sort.h>
namespace mold::elf {
using E = ARM32;
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_REL32:
case R_ARM_TARGET1:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TLS_GD32:
case R_ARM_TLS_LDM32:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_IE32:
case R_ARM_TLS_LE32:
case R_ARM_TLS_GOTDESC:
case R_ARM_TARGET2:
return *(il32 *)loc;
case R_ARM_THM_JUMP11:
return sign_extend(*(ul16 *)loc, 10) << 1;
case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24:
case R_ARM_THM_TLS_CALL: {
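// The branch displacement of a 32-bit Thumb branch is scattered over
// the two halfwords as S:imm10 and J1:J2:imm11, with I1 = NOT(J1 XOR
// S) and I2 = NOT(J2 XOR S). Reassemble it into a signed byte offset.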
u32 S = bit(*(ul16 *)loc, 10);
u32 J1 = bit(*(ul16 *)(loc + 2), 13);
u32 J2 = bit(*(ul16 *)(loc + 2), 11);
u32 I1 = !(J1 ^ S);
u32 I2 = !(J2 ^ S);
u32 imm10 = bits(*(ul16 *)loc, 9, 0);
u32 imm11 = bits(*(ul16 *)(loc + 2), 10, 0);
u32 val = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1);
return sign_extend(val, 24);
}
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
case R_ARM_TLS_CALL:
return sign_extend(*(ul32 *)loc, 23) << 2;
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_PREL:
case R_ARM_MOVT_ABS: {
u32 imm12 = bits(*(ul32 *)loc, 11, 0);
u32 imm4 = bits(*(ul32 *)loc, 19, 16);
return sign_extend((imm4 << 12) | imm12, 15);
}
case R_ARM_PREL31:
return sign_extend(*(ul32 *)loc, 30);
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_PREL:
case R_ARM_THM_MOVT_ABS: {
u32 imm4 = bits(*(ul16 *)loc, 3, 0);
u32 i = bit(*(ul16 *)loc, 10);
u32 imm3 = bits(*(ul16 *)(loc + 2), 14, 12);
u32 imm8 = bits(*(ul16 *)(loc + 2), 7, 0);
u32 val = (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8;
return sign_extend(val, 15);
}
default:
return 0;
}
}
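// Write a 16-bit immediate to an ARM MOVW/MOVT instruction, which
// encodes it in two fields, imm4 (bits 19-16) and imm12 (bits 11-0).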
static void write_mov_imm(u8 *loc, u32 val) {
u32 imm12 = bits(val, 11, 0);
u32 imm4 = bits(val, 15, 12);
*(ul32 *)loc = (*(ul32 *)loc & 0xfff0f000) | (imm4 << 16) | imm12;
}
static void write_thm_b_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/BL--BLX--immediate-
u32 sign = bit(val, 24);
u32 I1 = bit(val, 23);
u32 I2 = bit(val, 22);
u32 J1 = !I1 ^ sign;
u32 J2 = !I2 ^ sign;
u32 imm10 = bits(val, 21, 12);
u32 imm11 = bits(val, 11, 1);
ul16 *buf = (ul16 *)loc;
buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (sign << 10) | imm10;
buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11;
}
static void write_thm_mov_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/MOVT
u32 imm4 = bits(val, 15, 12);
u32 i = bit(val, 11);
u32 imm3 = bits(val, 10, 8);
u32 imm8 = bits(val, 7, 0);
ul16 *buf = (ul16 *)loc;
buf[0] = (buf[0] & 0b1111'1011'1111'0000) | (i << 10) | imm4;
buf[1] = (buf[1] & 0b1000'1111'0000'0000) | (imm3 << 12) | imm8;
}
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_ARM_NONE:
break;
case R_ARM_ABS32:
case R_ARM_REL32:
case R_ARM_TARGET1:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TLS_GD32:
case R_ARM_TLS_LDM32:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_IE32:
case R_ARM_TLS_LE32:
case R_ARM_TLS_GOTDESC:
case R_ARM_TARGET2:
*(ul32 *)loc = val;
break;
case R_ARM_THM_JUMP11:
*(ul16 *)loc = (*(ul16 *)loc & 0xf800) | bits(val, 11, 1);
break;
case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24:
case R_ARM_THM_TLS_CALL:
write_thm_b_imm(loc, val);
break;
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_PREL:
case R_ARM_MOVT_ABS:
write_mov_imm(loc, val);
break;
case R_ARM_PREL31:
*(ul32 *)loc = (*(ul32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff);
break;
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_PREL:
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, val);
break;
default:
unreachable();
}
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
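// The PLT header materializes the address of .got.plt in lr and jumps
// to the resolver address stored at .got.plt + 8. Note that reading pc
// in ARM mode yields the address of the current instruction plus 8;
// the literal at 2f accounts for that bias. The writeback in
// `ldr pc, [lr, #8]!` leaves lr pointing at the loaded GOT slot for
// the resolver's use.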
static const ul32 insn[] = {
0xe52d'e004, // push {lr}
0xe59f'e004, // ldr lr, 2f
0xe08f'e00e, // 1: add lr, pc, lr
0xe5be'f008, // ldr pc, [lr, #8]!
0x0000'0000, // 2: .word .got.plt - 1b - 8
0x0000'0000, // (padding)
0x0000'0000, // (padding)
0x0000'0000, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16;
}
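// Each PLT entry loads a PC-relative offset from its literal pool,
// converts it to the absolute address of the symbol's .got.plt slot
// and jumps to the address stored in that slot.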
static const ul32 plt_entry[] = {
0xe59f'c004, // 1: ldr ip, 2f
0xe08c'c00f, // add ip, ip, pc
0xe59c'f000, // ldr pc, [ip]
0x0000'0000, // 2: .word sym@GOT - 1b - 12
};
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
memcpy(buf, plt_entry, sizeof(plt_entry));
*(ul32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
memcpy(buf, plt_entry, sizeof(plt_entry));
*(ul32 *)(buf + 12) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
template <>
void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_ARM_ABS32:
*(ul32 *)loc = val;
break;
case R_ARM_REL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
// ARM and Thumb branch instructions can jump only within ±16 MiB,
// i.e., the branch displacement must fit in a 25-bit signed integer.
static bool is_jump_reachable(i64 val) {
return sign_extend(val, 24) == val;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
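// Returns the displacement from `addr` to the closest reachable
// thunk. Every thunk begins with the TLS trampoline code (see
// Thunk<E>::copy_buf below).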
auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable {
for (; i < output_section->thunks.size(); i++) {
i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
addr;
if (is_jump_reachable(disp))
return disp;
}
unreachable();
};
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
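// The one-letter variables below follow the AAELF32 relocation
// conventions: S is the symbol address, A the addend, P the place
// being relocated, T the symbol's Thumb bit, G the symbol's offset
// within the GOT, and GOT the address of the GOT itself.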
u64 S = sym.get_addr(ctx);
u64 A = get_addend(*this, rel);
u64 P = get_addr() + rel.r_offset;
u64 T = S & 1;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
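// A range extension (and mode switch) thunk entry has two entry
// points: +0 for Thumb callers and +4 for ARM callers (see
// Thunk<E>::copy_buf below).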
auto get_thumb_thunk_addr = [&] { return get_thunk_addr(i); };
auto get_arm_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_TARGET1:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
break;
case R_ARM_REL32:
*(ul32 *)loc = S + A - P;
break;
case R_ARM_THM_CALL: {
if (sym.is_remaining_undef_weak()) {
// On ARM, a call to a weak undefined symbol is resolved to a no-op
// so that it just falls through to the next instruction.
*(ul32 *)loc = 0x8000'f3af; // NOP.W
break;
}
// A THM_CALL relocation refers to either a BL or a BLX instruction.
// They differ in only one bit. We need to use BL if the jump target
// is Thumb and BLX otherwise. (A BLX displacement is rounded to a
// multiple of 4 because the callee executes in ARM mode.)
i64 val = S + A - P;
if (is_jump_reachable(val)) {
if (T) {
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
} else {
write_thm_b_imm(loc, align_to(val, 4));
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
}
} else {
write_thm_b_imm(loc, align_to(get_arm_thunk_addr() + A - P, 4));
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
}
break;
}
case R_ARM_BASE_PREL:
*(ul32 *)loc = GOT + A - P;
break;
case R_ARM_GOTOFF32:
*(ul32 *)loc = ((S + A) | T) - GOT;
break;
case R_ARM_GOT_PREL:
case R_ARM_TARGET2:
*(ul32 *)loc = GOT + G + A - P;
break;
case R_ARM_GOT_BREL:
*(ul32 *)loc = G + A;
break;
case R_ARM_CALL: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
break;
}
// Just like THM_CALL, ARM_CALL relocation refers to either BL or
// BLX instruction. We may need to rewrite BL → BLX or BLX → BL.
bool is_bl = ((*(ul32 *)loc & 0xff00'0000) == 0xeb00'0000);
bool is_blx = ((*(ul32 *)loc & 0xfe00'0000) == 0xfa00'0000);
if (!is_bl && !is_blx)
Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX";
u64 val = S + A - P;
if (is_jump_reachable(val)) {
if (T) {
*(ul32 *)loc = 0xfa00'0000; // BLX
*(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
} else {
*(ul32 *)loc = 0xeb00'0000; // BL
*(ul32 *)loc |= bits(val, 25, 2);
}
} else {
*(ul32 *)loc = 0xeb00'0000; // BL
*(ul32 *)loc |= bits(get_arm_thunk_addr() + A - P, 25, 2);
}
break;
}
case R_ARM_JUMP24: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
break;
}
// These relocs refer to a B (unconditional branch) instruction.
// Unlike BL or BLX, we can't rewrite B to BX in place when a
// processor mode switch is required, because BX doesn't take an
// immediate; it takes only a register. So if a mode switch is
// required, we jump to a linker-synthesized thunk, which does the
// job with a longer code sequence.
u64 val = S + A - P;
if (!is_jump_reachable(val) || T)
val = get_arm_thunk_addr() + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
}
case R_ARM_PLT32:
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
} else {
u64 val = (T ? get_arm_thunk_addr() : S) + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
}
break;
case R_ARM_THM_JUMP11:
assert(T);
check(S + A - P, -(1 << 11), 1 << 11);
*(ul16 *)loc &= 0xf800;
*(ul16 *)loc |= bits(S + A - P, 11, 1);
break;
case R_ARM_THM_JUMP19: {
i64 val = S + A - P;
check(val, -(1 << 19), 1 << 19);
// sign:J2:J1:imm6:imm11:'0'
u32 sign = bit(val, 20);
u32 J2 = bit(val, 19);
u32 J1 = bit(val, 18);
u32 imm6 = bits(val, 17, 12);
u32 imm11 = bits(val, 11, 1);
*(ul16 *)loc &= 0b1111'1011'1100'0000;
*(ul16 *)loc |= (sign << 10) | imm6;
*(ul16 *)(loc + 2) &= 0b1101'0000'0000'0000;
*(ul16 *)(loc + 2) |= (J2 << 13) | (J1 << 11) | imm11;
break;
}
case R_ARM_THM_JUMP24: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0x8000'f3af; // NOP.W
break;
}
// Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
// switch processor mode.
u64 val = S + A - P;
if (!is_jump_reachable(val) || !T)
val = get_thumb_thunk_addr() + A - P;
write_thm_b_imm(loc, val);
break;
}
case R_ARM_MOVW_PREL_NC:
write_mov_imm(loc, ((S + A) | T) - P);
break;
case R_ARM_MOVW_ABS_NC:
write_mov_imm(loc, (S + A) | T);
break;
case R_ARM_THM_MOVW_PREL_NC:
write_thm_mov_imm(loc, ((S + A) | T) - P);
break;
case R_ARM_PREL31:
check(S + A - P, -(1LL << 30), 1LL << 30);
*(ul32 *)loc &= 0x8000'0000;
*(ul32 *)loc |= (S + A - P) & 0x7fff'ffff;
break;
case R_ARM_THM_MOVW_ABS_NC:
write_thm_mov_imm(loc, (S + A) | T);
break;
case R_ARM_MOVT_PREL:
write_mov_imm(loc, (S + A - P) >> 16);
break;
case R_ARM_THM_MOVT_PREL:
write_thm_mov_imm(loc, (S + A - P) >> 16);
break;
case R_ARM_MOVT_ABS:
write_mov_imm(loc, (S + A) >> 16);
break;
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, (S + A) >> 16);
break;
case R_ARM_TLS_GD32:
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - P;
break;
case R_ARM_TLS_LDM32:
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - P;
break;
case R_ARM_TLS_LDO32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_ARM_TLS_IE32:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - P;
break;
case R_ARM_TLS_LE32:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
case R_ARM_TLS_GOTDESC:
// ARM32 TLSDESC uses the following code sequence to materialize
// a TP-relative address in r0.
//
// ldr r0, .L2
// .L1: bl foo
// R_ARM_TLS_CALL
// .L2: .word foo + . - .L1
// R_ARM_TLS_GOTDESC
//
// We may relax the instructions to the following for non-dlopen'd DSO
//
// ldr r0, .L2
// .L1: ldr r0, [pc, r0]
// ...
// .L2: .word foo(gottpoff) + . - .L1
//
// or to the following for executable.
//
// ldr r0, .L2
// .L1: nop
// ...
// .L2: .word foo(tpoff)
if (sym.has_tlsdesc(ctx)) {
// A is odd if the corresponding TLS_CALL is Thumb.
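// The written value must be the TLSDESC address minus lr as seen by
// the trampoline's `add r0, lr, r0`: after the BL, lr is .L1 + 4 in
// ARM mode or .L1 + 5 in Thumb mode. The odd A already contributes
// the Thumb bit, hence 6 rather than 5.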
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - ((A & 1) ? 6 : 4);
} else if (sym.has_gottp(ctx)) {
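// Relaxed to an IE access. In ARM mode, `ldr r0, [pc, r0]` reads pc
// as .L1 + 8; in Thumb mode, `add r0, pc` reads pc as .L1 + 4, and
// the odd A again supplies the extra 1, hence the 5.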
*(ul32 *)loc = sym.get_gottp_addr(ctx) - P + A - ((A & 1) ? 5 : 8);
} else {
*(ul32 *)loc = S - ctx.tp_addr;
}
break;
case R_ARM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
// BL <tls_trampoline>
*(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
} else if (sym.has_gottp(ctx)) {
*(ul32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0]
} else {
*(ul32 *)loc = 0xe320'f000; // nop
}
break;
case R_ARM_THM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL to BLX
} else if (sym.has_gottp(ctx)) {
// Since `ldr r0, [pc, r0]` is not representable in Thumb,
// we use two instructions instead.
*(ul16 *)loc = 0x4478; // add r0, pc
*(ul16 *)(loc + 2) = 0x6800; // ldr r0, [r0]
} else {
*(ul32 *)loc = 0x8000'f3af; // nop.w
}
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(*this, rel);
switch (rel.r_type) {
case R_ARM_ABS32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_ARM_TLS_LDO32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations to figure out which symbols need GOT, PLT, or
// TLS entries and which need dynamic relocations.
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
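// An ifunc symbol's address is computed at runtime by its resolver
// function, so references to it always go through the PLT and GOT.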
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_MOVT_ABS:
case R_ARM_THM_MOVT_ABS:
case R_ARM_TARGET1:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_ARM_MOVW_ABS_NC:
case R_ARM_THM_MOVW_ABS_NC:
scan_absrel(ctx, sym, rel);
break;
case R_ARM_THM_CALL:
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
case R_ARM_THM_JUMP24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TARGET2:
sym.flags |= NEEDS_GOT;
break;
case R_ARM_MOVT_PREL:
case R_ARM_THM_MOVT_PREL:
case R_ARM_PREL31:
scan_pcrel(ctx, sym, rel);
break;
case R_ARM_TLS_GD32:
sym.flags |= NEEDS_TLSGD;
break;
case R_ARM_TLS_LDM32:
ctx.needs_tlsld = true;
break;
case R_ARM_TLS_IE32:
sym.flags |= NEEDS_GOTTP;
break;
case R_ARM_TLS_CALL:
case R_ARM_THM_TLS_CALL:
scan_tlsdesc(ctx, sym);
break;
case R_ARM_TLS_LE32:
check_tlsle(ctx, sym, rel);
break;
case R_ARM_REL32:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_THM_JUMP11:
case R_ARM_THM_JUMP19:
case R_ARM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_TLS_LDO32:
case R_ARM_V4BX:
case R_ARM_TLS_GOTDESC:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void Thunk<E>::copy_buf(Context<E> &ctx) {
// TLS trampoline code. ARM32's TLSDESC is designed so that this
// common piece of code is factored out of object files to reduce
// output size. Since no object file provides it, the linker has to
// synthesize it.
static ul32 hdr[] = {
0xe08e'0000, // add r0, lr, r0
0xe590'1004, // ldr r1, [r0, #4]
0xe12f'ff11, // bx r1
0xe320'f000, // nop
};
// This is a range extension and mode switch thunk.
// It has two entry points: +0 for Thumb and +4 for ARM.
const u8 entry[] = {
// .thumb
0x78, 0x47, // bx pc # jumps to 1f
0xc0, 0x46, // nop
// .arm
0x00, 0xc0, 0x9f, 0xe5, // 1: ldr ip, 3f
0x0f, 0xf0, 0x8c, 0xe0, // 2: add pc, ip, pc
0x00, 0x00, 0x00, 0x00, // 3: .word sym - 2b - 8
};
static_assert(E::thunk_hdr_size == sizeof(hdr));
static_assert(E::thunk_size == sizeof(entry));
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
memcpy(buf, hdr, sizeof(hdr));
buf += sizeof(hdr);
u64 P = output_section.shdr.sh_addr + offset + sizeof(hdr);
for (Symbol<E> *sym : symbols) {
memcpy(buf, entry, sizeof(entry));
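// Fill in the displacement literal at 3f. `add pc, ip, pc` reads pc
// as the address of 2b plus 8, so the literal holds sym - 2b - 8,
// which equals sym - P - 16.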
*(ul32 *)(buf + 12) = sym->get_addr(ctx) - P - 16;
buf += sizeof(entry);
P += sizeof(entry);
}
}
template <>
u64 get_eflags(Context<E> &ctx) {
return EF_ARM_EABI_VER5;
}
// ARM executables use an .ARM.exidx section to look up an exception
// handling record for the current instruction pointer. The table needs
// to be sorted by address.
//
// Other targets use .eh_frame_hdr for the same purpose. I don't know
// why only ARM uses a different mechanism, but it's likely due to some
// historical reason.
//
// This function sorts .ARM.exidx records.
void fixup_arm_exidx_section(Context<E> &ctx) {
Timer t(ctx, "fixup_arm_exidx_section");
OutputSection<E> *osec = find_section(ctx, SHT_ARM_EXIDX);
if (!osec)
return;
// An .ARM.exidx record consists of a signed 31-bit relative address
// and a 32-bit value. The relative address indicates the start
// address of a function that the record covers. The value is one of
// the following:
//
// 1. CANTUNWIND, indicating that there's no unwinding info for the function,
// 2. a compact unwinding record encoded into a 32-bit value, or
// 3. a 31-bit relative address which points to a larger record in
// the .ARM.extab section.
//
// CANTUNWIND is the value 1. The most significant bit is set in (2)
// but not in (3), so we can distinguish them just by looking at the value.
const u32 EXIDX_CANTUNWIND = 1;
struct Entry {
ul32 addr;
ul32 val;
};
if (osec->shdr.sh_size % sizeof(Entry))
Fatal(ctx) << "invalid .ARM.exidx section size";
Entry *ent = (Entry *)(ctx.buf + osec->shdr.sh_offset);
i64 num_entries = osec->shdr.sh_size / sizeof(Entry);
// Each entry's address is relative to the entry itself. In order to
// sort the records by address, we first translate them so that the
// addresses are relative to the beginning of the section.
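// For example, a record at section offset 8 whose addr field holds -8
// points 8 bytes before the field itself, i.e., at the start of the
// section; after translation, the field holds 0.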
auto is_relative = [](u32 val) {
return val != EXIDX_CANTUNWIND && !(val & 0x8000'0000);
};
tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
i64 offset = sizeof(Entry) * i;
ent[i].addr = sign_extend(ent[i].addr, 30) + offset;
if (is_relative(ent[i].val))
ent[i].val = 0x7fff'ffff & (ent[i].val + offset);
});
tbb::parallel_sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) {
return a.addr < b.addr;
});
// Translate the addresses back so that they are relative to the
// entries themselves.
tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
i64 offset = sizeof(Entry) * i;
ent[i].addr = 0x7fff'ffff & (ent[i].addr - offset);
if (is_relative(ent[i].val))
ent[i].val = 0x7fff'ffff & (ent[i].val - offset);
});
// .ARM.exidx's sh_link should be set to the .text section index.
// The runtime doesn't care about it, but binutils' strip command does.
if (ctx.shdr) {
if (Chunk<E> *text = find_section(ctx, ".text")) {
osec->shdr.sh_link = text->shndx;
ctx.shdr->copy_buf(ctx);
}
}
}
} // namespace mold::elf