mold/elf/arch-riscv.cc

// RISC-V is a clean RISC ISA. It supports PC-relative load/store for
// position-independent code. Its 32-bit and 64-bit ISAs are almost
// identical. That is, you can think of RV32 as RV64 without 64-bit
// operations. In this file, we support both RV64 and RV32.
//
// RISC-V is essentially little-endian, but the big-endian version is
// available as an extension. GCC supports `-mbig-endian` to generate
// big-endian code. Even in big-endian mode, machine instructions are
// defined to be encoded in little-endian, though. Only the behavior of
// load/store instructions is different between LE RISC-V and BE RISC-V.
//
// From the linker's point of view, the RISC-V's psABI is unique because
// sections in input object files can be shrunk while being copied to the
// output file. That is contrary to other psABIs in which sections are an
// atomic unit of copying. Let me explain it in more detail.
//
// Since RISC-V instructions are 16-bit or 32-bit long, there's no way to
// embed a very large immediate into a branch instruction. In fact, JAL
// (jump and link) instruction can jump to only within PC ± 1 MiB because
// its immediate is only 21 bits long. If the destination is out of its
// reach, we need to use two instructions instead; the first instruction
// being AUIPC which sets upper 20 bits to a register and the second being
// JALR with a 12-bit immediate and the register. Combined, they specify a
// 32 bits displacement.
//
// Other RISC ISAs have the same limitation, and they solved the problem by
// letting the linker create so-called "range extension thunks". It works as
// follows: the compiler optimistically emits single jump instructions for
// function calls. If the linker finds that a branch target is out of reach,
// it emits a small piece of machine code near the branch instruction and
// redirect the branch to the linker-synthesized code. The code constructs a
// full 32-bit address in a register and jump to the destination. That
// linker-synthesized code is called "range extension thunks" or just
// "thunks".
//
// The RISC-V psABI is unique in that it works the other way around. That
// is, for RISC-V, the compiler always emits two instructions (AUIPC +
// JALR) for function calls. If the linker finds the destination is
// reachable with a single instruction, it replaces the two instructions
// with the one and shrinks the section size by one instruction length,
// instead of filling the gap with a nop.
//
// With the presence of this relaxation, sections can no longer be
// considered as an atomic unit. If we delete 4 bytes from the middle of a
// section, all contents after that point need to be shifted by 4. Symbol
// values and relocation offsets have to be adjusted accordingly if they
// refer to past the deleted bytes.
//
// In mold, we use `r_deltas` to memorize how many bytes have been adjusted
// for relocations. For symbols, we directly mutate their `value` member.
//
// RISC-V object files tend to have way more relocations than those for
// other targets. This is because all branches, including ones that jump
// within the same section, are explicitly expressed with relocations.
// Here is why we need them: all control-flow statements such as `if` or
// `for` are implemented using branch instructions. For other targets, the
// compiler doesn't emit relocations for such branches because it knows
// at compile-time exactly how many bytes have to be skipped. That's not
// true for RISC-V because the linker may delete bytes between a branch and
// its destination. Therefore, all branches including in-section ones have
// to be explicitly expressed with relocations.
//
// Note that this mechanism only shrinks sections and never enlarges them,
// as the compiler always emits the longest instruction sequence. This
// makes the linker implementation a bit simpler because we don't need
// to worry about oscillation.
//
// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc

#if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE

#include "elf.h"
#include "mold.h"

#include <regex>
#include <tbb/parallel_for.h>
#include <tbb/parallel_for_each.h>

namespace mold::elf {

using E = MOLD_TARGET;

// Patches the immediate field of the 32-bit I-type instruction at `loc`.
// Instruction bits 31:20 are replaced with bits 11:0 of `val`; every
// other bit of the instruction is left untouched.
static void write_itype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  u32 imm = bits(val, 11, 0) << 20;

  insn &= 0b000000'00000'11111'111'11111'1111111; // keep rs1/funct3/rd/opcode
  insn |= imm;
}

// Patches the split immediate of the 32-bit S-type instruction at `loc`.
// Bits 11:5 of `val` go to instruction bits 31:25 and bits 4:0 of `val`
// go to instruction bits 11:7. The remaining bits are preserved.
static void write_stype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  u32 imm_hi = bits(val, 11, 5) << 25;
  u32 imm_lo = bits(val, 4, 0) << 7;

  insn &= 0b000000'11111'11111'111'00000'1111111; // clear both imm fields
  insn |= imm_hi | imm_lo;
}

// Patches the scrambled branch displacement of the 32-bit B-type
// instruction at `loc`. The displacement bits of `val` are scattered as
// follows: bit 12 -> insn bit 31, bits 10:5 -> insn bits 30:25,
// bits 4:1 -> insn bits 11:8, bit 11 -> insn bit 7.
static void write_btype(u8 *loc, u32 val) {
  u32 imm_12   = bit(val, 12) << 31;
  u32 imm_10_5 = bits(val, 10, 5) << 25;
  u32 imm_4_1  = bits(val, 4, 1) << 8;
  u32 imm_11   = bit(val, 11) << 7;

  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'11111'11111'111'00000'1111111; // clear both imm fields
  insn |= imm_12 | imm_10_5 | imm_4_1 | imm_11;
}

// Patches the upper-20-bit immediate (instruction bits 31:12) of the
// 32-bit U-type instruction at `loc`.
//
// A U-type instruction is normally paired with an I-type instruction:
// the former loads the upper 20 bits of a register and the latter adds
// a sign-extended 12-bit immediate to it. Because the low part is
// sign-extended, 0x800 is added before taking the upper bits so that
// the two halves sum up to the intended value.
static void write_utype(u8 *loc, u32 val) {
  u32 hi20 = (val + 0x800) & 0xffff'f000;

  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'00000'00000'000'11111'1111111; // keep rd and opcode only
  insn |= hi20;
}

// Patches the scrambled jump displacement of the 32-bit J-type
// instruction at `loc`. The displacement bits of `val` are scattered as
// follows: bit 20 -> insn bit 31, bits 10:1 -> insn bits 30:21,
// bit 11 -> insn bit 20, bits 19:12 -> insn bits 19:12.
static void write_jtype(u8 *loc, u32 val) {
  u32 imm_20    = bit(val, 20) << 31;
  u32 imm_10_1  = bits(val, 10, 1) << 21;
  u32 imm_11    = bit(val, 11) << 20;
  u32 imm_19_12 = bits(val, 19, 12) << 12;

  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'00000'00000'000'11111'1111111; // keep rd and opcode only
  insn |= imm_20 | imm_10_1 | imm_11 | imm_19_12;
}

// Patches the 6-bit immediate of the 16-bit CI-type (compressed)
// instruction at `loc`: bit 5 of `val` goes to insn bit 12 and bits 4:0
// of `val` go to insn bits 6:2.
static void write_citype(u8 *loc, u32 val) {
  u32 imm_5   = bit(val, 5) << 12;
  u32 imm_4_0 = bits(val, 4, 0) << 2;

  ul16 &insn = *(ul16 *)loc;
  insn &= 0b111'0'11111'00000'11; // clear insn bits 12 and 6:2
  insn |= imm_5 | imm_4_0;
}

// Patches the scrambled branch displacement of the 16-bit CB-type
// (compressed branch) instruction at `loc`. Each displacement bit of
// `val` is moved individually to its position in the encoding.
static void write_cbtype(u8 *loc, u32 val) {
  u32 imm = 0;
  imm |= bit(val, 8) << 12;
  imm |= bit(val, 4) << 11;
  imm |= bit(val, 3) << 10;
  imm |= bit(val, 7) << 6;
  imm |= bit(val, 6) << 5;
  imm |= bit(val, 2) << 4;
  imm |= bit(val, 1) << 3;
  imm |= bit(val, 5) << 2;

  ul16 &insn = *(ul16 *)loc;
  insn &= 0b111'000'111'00000'11; // clear insn bits 12:10 and 6:2
  insn |= imm;
}

// Patches the scrambled jump displacement of the 16-bit CJ-type
// (compressed jump) instruction at `loc`. Each displacement bit of `val`
// is moved individually to its position in insn bits 12:2.
static void write_cjtype(u8 *loc, u32 val) {
  u32 imm = 0;
  imm |= bit(val, 11) << 12;
  imm |= bit(val, 4) << 11;
  imm |= bit(val, 9) << 10;
  imm |= bit(val, 8) << 9;
  imm |= bit(val, 10) << 8;
  imm |= bit(val, 6) << 7;
  imm |= bit(val, 7) << 6;
  imm |= bit(val, 3) << 5;
  imm |= bit(val, 2) << 4;
  imm |= bit(val, 1) << 3;
  imm |= bit(val, 5) << 2;

  ul16 &insn = *(ul16 *)loc;
  insn &= 0b111'00000000000'11; // clear insn bits 12:2
  insn |= imm;
}

// Overwrites the rs1 register field (instruction bits 19:15) of the
// 32-bit instruction at `loc` with `rs1`, which must be a valid register
// number (0..31). All other bits are preserved.
//
// The mask must be a full 32-bit pattern. A 31-digit literal would leave
// bit 31 as zero and silently clear the most significant immediate bit
// (the sign bit of an I/S-type displacement), turning a negative
// displacement into a positive one.
static void set_rs1(u8 *loc, u32 rs1) {
  assert(rs1 < 32);
  *(ul32 *)loc &= 0b1111111'11111'00000'111'11111'1111111;
  *(ul32 *)loc |= rs1 << 15;
}

// Writes the PLT header to `buf`. The header is copied from a fixed
// RV64 or RV32 instruction template and then the PC-relative
// displacement from .plt to .got.plt is patched into the AUIPC (word 0)
// and into the two instructions that carry the matching low 12 bits
// (words 2 and 4).
template <>
void write_plt_header<E>(Context<E> &ctx, u8 *buf) {
  static const ul32 hdr_rv64[] = {
    0x0000'0397, // auipc  t2, %pcrel_hi(.got.plt)
    0x41c3'0333, // sub    t1, t1, t3               # .plt entry + hdr + 12
    0x0003'be03, // ld     t3, %pcrel_lo(1b)(t2)    # _dl_runtime_resolve
    0xfd43'0313, // addi   t1, t1, -44              # .plt entry
    0x0003'8293, // addi   t0, t2, %pcrel_lo(1b)    # &.got.plt
    0x0013'5313, // srli   t1, t1, 1                # .plt entry offset
    0x0082'b283, // ld     t0, 8(t0)                # link map
    0x000e'0067, // jr     t3
  };

  static const ul32 hdr_rv32[] = {
    0x0000'0397, // auipc  t2, %pcrel_hi(.got.plt)
    0x41c3'0333, // sub    t1, t1, t3               # .plt entry + hdr + 12
    0x0003'ae03, // lw     t3, %pcrel_lo(1b)(t2)    # _dl_runtime_resolve
    0xfd43'0313, // addi   t1, t1, -44              # .plt entry
    0x0003'8293, // addi   t0, t2, %pcrel_lo(1b)    # &.got.plt
    0x0023'5313, // srli   t1, t1, 2                # .plt entry offset
    0x0042'a283, // lw     t0, 4(t0)                # link map
    0x000e'0067, // jr     t3
  };

  // Distance from the start of .plt (where the AUIPC lives) to .got.plt.
  u64 gotplt_addr = ctx.gotplt->shdr.sh_addr;
  u64 plt_addr = ctx.plt->shdr.sh_addr;
  u64 disp = gotplt_addr - plt_addr;

  if (E::is_64)
    memcpy(buf, hdr_rv64, E::plt_hdr_size);
  else
    memcpy(buf, hdr_rv32, E::plt_hdr_size);

  write_utype(buf, disp);      // auipc t2
  write_itype(buf + 8, disp);  // ld/lw t3
  write_itype(buf + 16, disp); // addi t0
}

// Instruction template for a single PLT-style entry on RV64. It is copied
// verbatim by write_plt_entry and write_pltgot_entry, which then patch the
// immediates of the first two instructions (AUIPC and the load) with the
// PC-relative displacement to the GOT slot to load from.
static const ul32 plt_entry_64[] = {
  0x0000'0e17, // auipc   t3, %pcrel_hi(function@.got.plt)
  0x000e'3e03, // ld      t3, %pcrel_lo(1b)(t3)
  0x000e'0367, // jalr    t1, t3
  0x0000'0013, // nop
};

// RV32 counterpart of plt_entry_64. The only difference is that the GOT
// slot is read with LW instead of LD.
static const ul32 plt_entry_32[] = {
  0x0000'0e17, // auipc   t3, %pcrel_hi(function@.got.plt)
  0x000e'2e03, // lw      t3, %pcrel_lo(1b)(t3)
  0x000e'0367, // jalr    t1, t3
  0x0000'0013, // nop
};

// Writes the PLT entry for `sym` to `buf`: copies the entry template and
// patches the AUIPC + load pair with the displacement from the PLT entry
// to the symbol's .got.plt slot.
template <>
void write_plt_entry<E>(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  u64 slot_addr = sym.get_gotplt_addr(ctx);
  u64 entry_addr = sym.get_plt_addr(ctx);
  u64 disp = slot_addr - entry_addr;

  const ul32 *insns = E::is_64 ? plt_entry_64 : plt_entry_32;
  memcpy(buf, insns, E::plt_size);

  write_utype(buf, disp);     // auipc t3
  write_itype(buf + 4, disp); // ld/lw t3
}

// Writes a PLT entry for `sym` that loads its target from the address
// returned by get_got_pltgot_addr() instead of a .got.plt slot. The same
// instruction template as for regular PLT entries is used.
template <>
void write_pltgot_entry<E>(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  u64 slot_addr = sym.get_got_pltgot_addr(ctx);
  u64 entry_addr = sym.get_plt_addr(ctx);
  u64 disp = slot_addr - entry_addr;

  const ul32 *insns = E::is_64 ? plt_entry_64 : plt_entry_32;
  memcpy(buf, insns, E::plt_size);

  write_utype(buf, disp);     // auipc t3
  write_itype(buf + 4, disp); // ld/lw t3
}

// Applies a single relocation to the output .eh_frame section.
//
// `offset` is the location to patch relative to the start of this
// section and `val` is the value to apply. Relocation types other than
// the ones handled below are reported as a fatal error.
template <>
void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                       u64 offset, u64 val) {
  // SUB6/SET6 only touch the low 6 bits of a byte.
  constexpr u8 upper2 = 0b1100'0000;
  constexpr u8 lower6 = 0b0011'1111;

  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  U16<E> *loc16 = (U16<E> *)loc;
  U32<E> *loc32 = (U32<E> *)loc;

  switch (rel.r_type) {
  case R_NONE:
    break;

  // Read-modify-write relocations
  case R_RISCV_ADD32:
    *loc32 += val;
    break;
  case R_RISCV_SUB8:
    *loc -= val;
    break;
  case R_RISCV_SUB16:
    *loc16 -= val;
    break;
  case R_RISCV_SUB32:
    *loc32 -= val;
    break;
  case R_RISCV_SUB6:
    *loc = (*loc & upper2) | ((*loc - val) & lower6);
    break;

  // Overwriting relocations
  case R_RISCV_SET6:
    *loc = (*loc & upper2) | (val & lower6);
    break;
  case R_RISCV_SET8:
    *loc = val;
    break;
  case R_RISCV_SET16:
    *loc16 = val;
    break;
  case R_RISCV_SET32:
    *loc32 = val;
    break;

  // PC-relative: subtract the address of the patched location
  case R_RISCV_32_PCREL:
    *loc32 = val - this->shdr.sh_addr - offset;
    break;

  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}

static inline bool is_hi20(const ElfRel<E> &rel) {
  u32 ty = rel.r_type;
  return ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
         ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20 ||
         ty == R_RISCV_TLSDESC_HI20;
}

// Applies all relocations of this section to its copy in the output
// buffer. `base` points to the beginning of that copy.
//
// If the section was shrunk by relaxation, `extra.r_deltas[i]` holds the
// number of bytes removed before the i'th relocation. It is used to
// translate the relocation offsets of the input file into offsets in
// the (shorter) output section, and the difference between two adjacent
// deltas tells how many bytes were removed at a given relocation, i.e.
// which relaxed instruction form has to be written.
//
// S, A, P, G and GOT below follow the usual psABI notation: symbol
// address, addend, address of the relocated place, GOT slot offset and
// the address of .got, respectively.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  u64 GP = ctx.__global_pointer ? ctx.__global_pointer->get_addr(ctx) : 0;

  // Location in .rel.dyn where dynamic relocations for this section go.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);

  // An empty r_deltas means that the section was not relaxed at all.
  auto get_r_delta = [&](i64 idx) {
    return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
  };

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX)
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    i64 r_offset = rel.r_offset - get_r_delta(i);
    i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
    u8 *loc = base + r_offset;

    // Reports an error if `val` is not in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };

    // A LO12-style relocation refers to a label at the instruction that
    // carries the corresponding HI20 relocation. This function returns
    // the index of that HI20 relocation. The search direction is chosen
    // by comparing the label's value with our own (adjusted) offset.
    auto find_paired_reloc = [&] {
      if (sym.value <= rels[i].r_offset - get_r_delta(i)) {
        for (i64 j = i - 1; j >= 0; j--)
          if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j))
            return j;
      } else {
        for (i64 j = i + 1; j < rels.size(); j++)
          if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j))
            return j;
      }

      Fatal(ctx) << *this << ": paired relocation is missing: " << i;
    };

    // `offset` is an offset in the input section contents, so callers
    // pass the unadjusted rel.r_offset.
    auto get_rd = [&](i64 offset) {
      // Returns the rd register of an R/I/U/J-type instruction.
      return bits(*(ul32 *)(contents.data() + offset), 11, 7);
    };

    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;

    switch (rel.r_type) {
    case R_RISCV_32:
      if constexpr (E::is_64)
        *(U32<E> *)loc = S + A;
      else
        apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
      break;
    case R_RISCV_64:
      assert(E::is_64);
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, &dynrel);
      break;
    case R_RISCV_BRANCH:
      check(S + A - P, -(1 << 12), 1 << 12);
      write_btype(loc, S + A - P);
      break;
    case R_RISCV_JAL:
      check(S + A - P, -(1 << 20), 1 << 20);
      write_jtype(loc, S + A - P);
      break;
    case R_RISCV_CALL:
    case R_RISCV_CALL_PLT: {
      i64 val = S + A - P;
      i64 rd = get_rd(rel.r_offset + 4);

      // Calling an undefined weak symbol does not make sense.
      // We make such call into an infinite loop. This should
      // help debugging of a faulty program.
      if (sym.esym().is_undef_weak())
        val = 0;

      if (removed_bytes == 4) {
        // auipc + jalr -> jal
        *(ul32 *)loc = (rd << 7) | 0b1101111;
        write_jtype(loc, val);
      } else if (removed_bytes == 6 && rd == 0) {
        // auipc + jalr -> c.j
        *(ul16 *)loc = 0b101'00000000000'01;
        write_cjtype(loc, val);
      } else if (removed_bytes == 6 && rd == 1) {
        // auipc + jalr -> c.jal
        assert(!E::is_64);
        *(ul16 *)loc = 0b001'00000000000'01;
        write_cjtype(loc, val);
      } else {
        assert(removed_bytes == 0);
        check(val, -(1LL << 31), 1LL << 31);
        write_utype(loc, val);
        write_itype(loc + 4, val);
      }
      break;
    }
    case R_RISCV_GOT_HI20: {
      // This relocation usually refers to an AUIPC + LD instruction
      // pair to load a symbol value from the GOT. If the symbol value
      // is actually a link-time constant, we can materialize the value
      // directly into a register to eliminate a memory load.
      i64 rd = get_rd(rel.r_offset);

      switch (removed_bytes) {
      case 6:
        // c.li <rd>, val
        *(ul16 *)loc = 0b010'0'00000'00000'01 | (rd << 7);
        write_citype(loc, sym.get_addr(ctx));
        i += 3; // skip RELAX, PCREL_LO12_I and RELAX that follow
        break;
      case 4:
        // addi <rd>, zero, val
        *(ul32 *)loc = 0b0010011 | (rd << 7);
        write_itype(loc, sym.get_addr(ctx));
        i += 3; // skip RELAX, PCREL_LO12_I and RELAX that follow
        break;
      case 0:
        if (ctx.arg.relax &&
            sym.is_pcrel_linktime_const(ctx) &&
            i + 3 < rels.size() &&
            rels[i + 1].r_type == R_RISCV_RELAX &&
            rels[i + 2].r_type == R_RISCV_PCREL_LO12_I &&
            rels[i + 2].r_offset == rels[i].r_offset + 4 &&
            file.symbols[rels[i + 2].r_sym]->value == r_offset &&
            rels[i + 3].r_type == R_RISCV_RELAX) {
          i64 val = S + A - P;
          if (rd == get_rd(rel.r_offset + 4) && (i32)val == val) {
            // auipc <rd>, %hi20(val)
            write_utype(loc, val);

            // addi <rd>, <rd>, %lo12(val)
            *(ul32 *)(loc + 4) = 0b0010011 | (rd << 15) | (rd << 7);
            write_itype(loc + 4, val);
            i += 3;
            break;
          }
        }

        write_utype(loc, G + GOT + A - P);
        break;
      default:
        unreachable();
      }
      break;
    }
    case R_RISCV_TLS_GOT_HI20:
      write_utype(loc, sym.get_gottp_addr(ctx) + A - P);
      break;
    case R_RISCV_TLS_GD_HI20:
      write_utype(loc, sym.get_tlsgd_addr(ctx) + A - P);
      break;
    case R_RISCV_PCREL_HI20:
      write_utype(loc, S + A - P);
      break;
    case R_RISCV_PCREL_LO12_I:
    case R_RISCV_PCREL_LO12_S:
    case R_RISCV_GPREL_LO12_I:
    case R_RISCV_GPREL_LO12_S: {
      // The value to write is the low 12 bits of whatever the paired
      // HI20 relocation computed, so recompute it from that relocation.
      i64 idx2 = find_paired_reloc();
      const ElfRel<E> &rel2 = rels[idx2];
      Symbol<E> &sym2 = *file.symbols[rel2.r_sym];

      u64 S = sym2.get_addr(ctx);
      u64 A = rel2.r_addend;
      u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);
      u64 G = sym2.get_got_idx(ctx) * sizeof(Word<E>);
      u64 val;

      switch (rel2.r_type) {
      case R_RISCV_GOT_HI20:
        val = G + GOT + A - P;
        break;
      case R_RISCV_TLS_GOT_HI20:
        val = sym2.get_gottp_addr(ctx) + A - P;
        break;
      case R_RISCV_TLS_GD_HI20:
        val = sym2.get_tlsgd_addr(ctx) + A - P;
        break;
      case R_RISCV_PCREL_HI20:
        val = S + A - P;
        break;
      case R_RISCV_GPREL_HI20:
        // NOTE(review): is_hi20() does not list R_RISCV_GPREL_HI20, so
        // find_paired_reloc() cannot return such a relocation and this
        // case looks unreachable as written -- confirm the intended
        // pairing rule for GPREL_LO12 relocations.
        val = S + A - GP;
        break;
      default:
        unreachable();
      }

      if (rel.r_type == R_RISCV_PCREL_LO12_I ||
          rel.r_type == R_RISCV_GPREL_LO12_I)
        write_itype(loc, val);
      else
        write_stype(loc, val);
      break;
    }
    case R_RISCV_HI20:
      if (removed_bytes == 2) {
        // Rewrite LUI with C.LUI
        i64 rd = get_rd(rel.r_offset);
        *(ul16 *)loc = 0b011'0'00000'00000'01 | (rd << 7);
        write_citype(loc, (S + A + 0x800) >> 12);
      } else if (removed_bytes == 0) {
        check(S + A, -(1LL << 31), 1LL << 31);
        write_utype(loc, S + A);
      }
      break;
    case R_RISCV_LO12_I:
    case R_RISCV_LO12_S:
      if (rel.r_type == R_RISCV_LO12_I)
        write_itype(loc, S + A);
      else
        write_stype(loc, S + A);

      // Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is
      // accessible relative to the zero register because if that's the
      // case, corresponding LUI might have been removed by relaxation.
      if (sign_extend(S + A, 11) == S + A)
        set_rs1(loc, 0);
      break;
    case R_RISCV_TPREL_HI20:
      assert(removed_bytes == 0 || removed_bytes == 4);
      if (removed_bytes == 0)
        write_utype(loc, S + A - ctx.tp_addr);
      break;
    case R_RISCV_TPREL_ADD:
      // This relocation just annotates an ADD instruction that can be
      // removed when a TPREL is relaxed. No value is needed to be
      // written.
      assert(removed_bytes == 0 || removed_bytes == 4);
      break;
    case R_RISCV_TPREL_LO12_I:
    case R_RISCV_TPREL_LO12_S: {
      i64 val = S + A - ctx.tp_addr;
      if (rel.r_type == R_RISCV_TPREL_LO12_I)
        write_itype(loc, val);
      else
        write_stype(loc, val);

      // Rewrite `lw t1, 0(t0)` with `lw t1, 0(tp)` if the address is
      // directly accessible using tp. tp is x4.
      if (sign_extend(val, 11) == val)
        set_rs1(loc, 4);
      break;
    }
    case R_RISCV_TLSDESC_HI20:
      // RISC-V TLSDESC uses the following code sequence to materialize
      // a TP-relative address in x0.
      //
      //   .L0:
      //   auipc  tX, 0
      //       R_RISCV_TLSDESC_HI20         foo
      //   l[d|w] tY, tX, 0
      //       R_RISCV_TLSDESC_LOAD_LO12_I  .L0
      //   addi   a0, tX, 0
      //       R_RISCV_TLSDESC_ADD_LO12_I   .L0
      //   jalr   t0, tY
      //       R_RISCV_TLSDESC_CALL         .L0
      //
      // For non-dlopen'd DSO, we may relax the instructions to the following:
      //
      //   <deleted>
      //   <deleted>
      //   auipc  a0, %gottp_hi(a0)
      //   l[d|w] a0, %gottp_lo(a0)
      //
      // For executable, if the TP offset is small enough, we'll relax
      // it to the following:
      //
      //   <deleted>
      //   <deleted>
      //   <deleted>
      //   addi   a0, zero, %tpoff_lo(a0)
      //
      // Otherwise, the following sequence is used:
      //
      //   <deleted>
      //   <deleted>
      //   lui    a0, %tpoff_hi(a0)
      //   addi   a0, a0, %tpoff_lo(a0)
      if (removed_bytes == 0)
        write_utype(loc, sym.get_tlsdesc_addr(ctx) + A - P);
      break;
    case R_RISCV_TLSDESC_LOAD_LO12:
    case R_RISCV_TLSDESC_ADD_LO12:
    case R_RISCV_TLSDESC_CALL: {
      // The instruction was deleted by relaxation; nothing to patch.
      if (removed_bytes == 4)
        break;

      i64 idx2 = find_paired_reloc();
      const ElfRel<E> &rel2 = rels[idx2];
      Symbol<E> &sym2 = *file.symbols[rel2.r_sym];

      u64 S = sym2.get_addr(ctx);
      u64 A = rel2.r_addend;
      u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);

      switch (rel.r_type) {
      case R_RISCV_TLSDESC_LOAD_LO12:
        write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P);
        break;
      case R_RISCV_TLSDESC_ADD_LO12:
        if (sym2.has_tlsdesc(ctx)) {
          write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P);
        } else if (sym2.has_gottp(ctx)) {
          *(ul32 *)loc = 0x517; // auipc a0,<hi20>
          write_utype(loc, sym2.get_gottp_addr(ctx) + A - P);
        } else {
          *(ul32 *)loc = 0x537; // lui a0,<hi20>
          write_utype(loc, S + A - ctx.tp_addr);
        }
        break;
      case R_RISCV_TLSDESC_CALL:
        if (sym2.has_tlsdesc(ctx)) {
          // Do nothing
        } else if (sym2.has_gottp(ctx)) {
          // l[d|w] a0,<lo12>
          *(ul32 *)loc = E::is_64 ? 0x53503 : 0x52503;
          write_itype(loc, sym2.get_gottp_addr(ctx) + A - P);
        } else {
          i64 val = S + A - ctx.tp_addr;
          if (sign_extend(val, 11) == val)
            *(ul32 *)loc = 0x513;   // addi a0,zero,<lo12>
          else
            *(ul32 *)loc = 0x50513; // addi a0,a0,<lo12>
          write_itype(loc, val);
        }
        break;
      }
      break;
    }
    case R_RISCV_ADD8:
      // Modify the byte at `loc`, not the pointer itself.
      *loc += S + A;
      break;
    case R_RISCV_ADD16:
      *(U16<E> *)loc += S + A;
      break;
    case R_RISCV_ADD32:
      *(U32<E> *)loc += S + A;
      break;
    case R_RISCV_ADD64:
      *(U64<E> *)loc += S + A;
      break;
    case R_RISCV_SUB8:
      // Modify the byte at `loc`, not the pointer itself.
      *loc -= S + A;
      break;
    case R_RISCV_SUB16:
      *(U16<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB32:
      *(U32<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB64:
      *(U64<E> *)loc -= S + A;
      break;
    case R_RISCV_ALIGN: {
      // A R_RISCV_ALIGN is followed by a NOP sequence. We need to remove
      // zero or more bytes so that the instruction after R_RISCV_ALIGN is
      // aligned to a given alignment boundary.
      //
      // We need to guarantee that the NOP sequence is valid after byte
      // removal (e.g. we can't remove the first 2 bytes of a 4-byte NOP).
      // For the sake of simplicity, we always rewrite the entire NOP sequence.
      i64 padding_bytes = rel.r_addend - removed_bytes;
      assert((padding_bytes & 1) == 0);

      // Fill with as many 4-byte NOPs as fit, then at most one 2-byte NOP.
      i64 pos = 0;
      for (; pos <= padding_bytes - 4; pos += 4)
        *(ul32 *)(loc + pos) = 0x0000'0013; // nop
      if (pos < padding_bytes)
        *(ul16 *)(loc + pos) = 0x0001;      // c.nop
      break;
    }
    case R_RISCV_RVC_BRANCH:
      check(S + A - P, -(1 << 8), 1 << 8);
      write_cbtype(loc, S + A - P);
      break;
    case R_RISCV_RVC_JUMP:
      check(S + A - P, -(1 << 11), 1 << 11);
      write_cjtype(loc, S + A - P);
      break;
    case R_RISCV_GPREL_HI20:
      check(S + A - GP, -(1LL << 31), 1LL << 31);
      write_utype(loc, S + A - GP);
      break;
    case R_RISCV_SUB6:
      *loc = (*loc & 0b1100'0000) | ((*loc - S - A) & 0b0011'1111);
      break;
    case R_RISCV_SET6:
      *loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
      break;
    case R_RISCV_SET8:
      *loc = S + A;
      break;
    case R_RISCV_SET16:
      *(U16<E> *)loc = S + A;
      break;
    case R_RISCV_SET32:
      *(U32<E> *)loc = S + A;
      break;
    case R_RISCV_PLT32:
    case R_RISCV_32_PCREL:
      *(U32<E> *)loc = S + A - P;
      break;
    case R_RISCV_SET_ULEB128:
      overwrite_uleb(loc, S + A);
      break;
    case R_RISCV_SUB_ULEB128:
      overwrite_uleb(loc, read_uleb(loc) - S - A);
      break;
    default:
      unreachable();
    }
  }
}

// Applies relocations to the copy of this section at `base`. Only plain
// data relocations are accepted here; any other type is reported as a
// fatal error. Relocations against undefined symbols are recorded via
// record_undef_error() and skipped.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  // SUB6/SET6 only touch the low 6 bits of a byte.
  constexpr u8 upper2 = 0b1100'0000;
  constexpr u8 lower6 = 0b0011'1111;

  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (const ElfRel<E> &rel : rels) {
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    U16<E> *loc16 = (U16<E> *)loc;
    U32<E> *loc32 = (U32<E> *)loc;
    U64<E> *loc64 = (U64<E> *)loc;

    // If the relocation refers to a section fragment, the fragment's
    // address and addend take precedence over the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;
    u64 val = S + A;

    switch (rel.r_type) {
    case R_RISCV_32:
      *loc32 = val;
      break;
    case R_RISCV_64: {
      std::optional<u64> tombstone = get_tombstone(sym, frag);
      if (tombstone)
        *loc64 = *tombstone;
      else
        *loc64 = val;
      break;
    }

    // Read-modify-write relocations
    case R_RISCV_ADD8:
      *loc += val;
      break;
    case R_RISCV_ADD16:
      *loc16 += val;
      break;
    case R_RISCV_ADD32:
      *loc32 += val;
      break;
    case R_RISCV_ADD64:
      *loc64 += val;
      break;
    case R_RISCV_SUB8:
      *loc -= val;
      break;
    case R_RISCV_SUB16:
      *loc16 -= val;
      break;
    case R_RISCV_SUB32:
      *loc32 -= val;
      break;
    case R_RISCV_SUB64:
      *loc64 -= val;
      break;
    case R_RISCV_SUB6:
      *loc = (*loc & upper2) | ((*loc - val) & lower6);
      break;
    case R_RISCV_SUB_ULEB128:
      overwrite_uleb(loc, read_uleb(loc) - val);
      break;

    // Overwriting relocations
    case R_RISCV_SET6:
      *loc = (*loc & upper2) | (val & lower6);
      break;
    case R_RISCV_SET8:
      *loc = val;
      break;
    case R_RISCV_SET16:
      *loc16 = val;
      break;
    case R_RISCV_SET32:
      *loc32 = val;
      break;
    case R_RISCV_SET_ULEB128:
      overwrite_uleb(loc, val);
      break;

    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
      break;
    }
  }
}

// Copies this section's contents to `buf`, leaving out the bytes that
// were removed by relaxation.
//
// `extra.r_deltas[i + 1] - extra.r_deltas[i]` is the number of bytes
// removed at the i'th relocation. The input is therefore copied as a
// series of chunks, each ending at a relocation at which bytes were
// removed, followed by the remaining tail.
template <>
void InputSection<E>::copy_contents_riscv(Context<E> &ctx, u8 *buf) {
  // An empty r_deltas means nothing was removed; copy in one go.
  if (extra.r_deltas.empty()) {
    uncompress_to(ctx, buf);
    return;
  }

  std::span<const ElfRel<E>> rels = get_rels(ctx);
  i64 src_pos = 0; // next input byte that has not been copied or skipped

  for (i64 idx = 0; idx < rels.size(); idx++) {
    i64 removed = extra.r_deltas[idx + 1] - extra.r_deltas[idx];
    if (removed != 0) {
      assert(removed > 0);

      // Copy everything up to the relocated place, then skip the
      // removed bytes in the input.
      i64 chunk = rels[idx].r_offset - src_pos;
      memcpy(buf, contents.data() + src_pos, chunk);
      buf += chunk;
      src_pos = rels[idx].r_offset + removed;
    }
  }

  memcpy(buf, contents.data() + src_pos, contents.size() - src_pos);
}

// Scans the relocations of this section and records on each referenced
// symbol what it needs (GOT slot, PLT entry, TLS GOT entries, ...), or
// delegates to the generic scan_* helpers for absolute and PC-relative
// references. No section contents are modified here.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (const ElfRel<E> &rel : rels) {
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];

    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;

    switch (rel.r_type) {
    // Relocations that need nothing from the symbol
    case R_RISCV_BRANCH:
    case R_RISCV_JAL:
    case R_RISCV_PCREL_LO12_I:
    case R_RISCV_PCREL_LO12_S:
    case R_RISCV_LO12_I:
    case R_RISCV_LO12_S:
    case R_RISCV_TLSDESC_LOAD_LO12:
    case R_RISCV_TLSDESC_ADD_LO12:
    case R_RISCV_TLSDESC_CALL:
    case R_RISCV_ADD8:
    case R_RISCV_ADD16:
    case R_RISCV_ADD32:
    case R_RISCV_ADD64:
    case R_RISCV_SUB8:
    case R_RISCV_SUB16:
    case R_RISCV_SUB32:
    case R_RISCV_SUB64:
    case R_RISCV_ALIGN:
    case R_RISCV_RVC_BRANCH:
    case R_RISCV_RVC_JUMP:
    case R_RISCV_GPREL_LO12_I:
    case R_RISCV_GPREL_LO12_S:
    case R_RISCV_RELAX:
    case R_RISCV_SUB6:
    case R_RISCV_SET6:
    case R_RISCV_SET8:
    case R_RISCV_SET16:
    case R_RISCV_SET32:
    case R_RISCV_SET_ULEB128:
    case R_RISCV_SUB_ULEB128:
      break;

    // Absolute references
    case R_RISCV_32:
      // A 32-bit word is word-sized only on RV32.
      if constexpr (E::is_64)
        scan_absrel(ctx, sym, rel);
      else
        scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_RISCV_64:
      if constexpr (!E::is_64)
        Fatal(ctx) << *this << ": R_RISCV_64 cannot be used on RV32";
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_RISCV_HI20:
      scan_absrel(ctx, sym, rel);
      break;

    // PC-relative references
    case R_RISCV_32_PCREL:
    case R_RISCV_PCREL_HI20:
      scan_pcrel(ctx, sym, rel);
      break;

    // Function calls
    case R_RISCV_CALL:
    case R_RISCV_CALL_PLT:
    case R_RISCV_PLT32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;

    // GOT and TLS
    case R_RISCV_GOT_HI20:
      sym.flags |= NEEDS_GOT;
      break;
    case R_RISCV_TLS_GOT_HI20:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_RISCV_TLS_GD_HI20:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_RISCV_TLSDESC_HI20:
      scan_tlsdesc(ctx, sym);
      break;
    case R_RISCV_TPREL_HI20:
    case R_RISCV_TPREL_LO12_I:
    case R_RISCV_TPREL_LO12_S:
    case R_RISCV_TPREL_ADD:
      check_tlsle(ctx, sym, rel);
      break;

    // GP-relative
    case R_RISCV_GPREL_HI20:
      if (ctx.arg.shared)
        Fatal(ctx) << *this << ": R_RISCV_GPREL_HI20 may not be used with -shared";
      break;

    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}

// Computes the e_flags value of the output file from the input object
// files, ignoring the linker's internal object.
//
// The flags of the first object are used as the baseline. EF_RISCV_RVC
// is set if any object has it. An error is reported for every object
// whose floating-point ABI or EF_RISCV_RVE bit differs from the result
// accumulated so far. Returns 0 if there is no input object.
template <>
u64 get_eflags(Context<E> &ctx) {
  std::vector<ObjectFile<E> *> files = ctx.objs;
  std::erase(files, ctx.internal_obj);

  if (files.empty())
    return 0;

  ObjectFile<E> *first = files[0];
  u32 merged = first->get_ehdr().e_flags;

  for (auto it = files.begin() + 1; it != files.end(); ++it) {
    ObjectFile<E> *obj = *it;
    u32 flags = obj->get_ehdr().e_flags;

    if (flags & EF_RISCV_RVC)
      merged |= EF_RISCV_RVC;

    bool same_float_abi =
      (flags & EF_RISCV_FLOAT_ABI) == (merged & EF_RISCV_FLOAT_ABI);
    if (!same_float_abi)
      Error(ctx) << *obj << ": cannot link object files with different"
                 << " floating-point ABI from " << *first;

    bool same_rve = (flags & EF_RISCV_RVE) == (merged & EF_RISCV_RVE);
    if (!same_rve)
      Error(ctx) << *obj << ": cannot link object files with different"
                 << " EF_RISCV_RVE from " << *first;
  }
  return merged;
}

// Returns true if `isec` is a candidate for shrinking, i.e. it is a
// live section with both SHF_ALLOC and SHF_EXECINSTR set. A null pointer
// is accepted and yields false.
static bool is_resizable(InputSection<E> *isec) {
  if (!isec || !isec->is_alive)
    return false;
  if (!(isec->shdr().sh_flags & SHF_ALLOC))
    return false;
  return isec->shdr().sh_flags & SHF_EXECINSTR;
}

// Returns the signed displacement (S + A - P) from the place relocated
// by `rel` in `isec` to the target symbol `sym`. INT32_MAX is returned
// for targets that must never be considered reachable for relaxation.
static i64 compute_distance(Context<E> &ctx, Symbol<E> &sym,
                            InputSection<E> &isec, const ElfRel<E> &rel) {
  // Absolute symbols are treated as if they were infinitely far away
  // because `shrink_section` may increase the distance between a branch
  // instruction and an absolute symbol. Branching to an absolute
  // location is extremely rare in real code, though.
  //
  // Relocations against weak undefined symbols are not relaxed either.
  if (sym.is_absolute() || sym.esym().is_undef_weak())
    return INT32_MAX;

  i64 target = sym.get_addr(ctx);
  i64 addend = rel.r_addend;
  i64 place = isec.get_addr() + rel.r_offset;
  return target + addend - place;
}

// Scan relocations to shrink a section.
//
// For each relocation, this function decides how many bytes can be
// removed at the relocated place and accumulates the result:
//
//  - isec.extra.r_deltas[i] is the number of bytes removed before the
//    i'th relocation; r_deltas[rels.size()] is the total.
//  - isec.sh_size is reduced by the total.
//
// No section contents are rewritten here; only sizes are computed.
//
// `use_rvc` tells whether 2-byte compressed instructions may be used as
// replacements.
static void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
  std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
  isec.extra.r_deltas.resize(rels.size() + 1);

  // Extracts bits [11:7] (the rd register field) of the 32-bit
  // instruction at the given section offset.
  auto get_rd = [&](i64 offset) {
    return bits(*(ul32 *)(isec.contents.data() + offset), 11, 7);
  };

  // Number of bytes removed so far.
  i64 delta = 0;

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &r = rels[i];
    Symbol<E> &sym = *isec.file.symbols[r.r_sym];
    isec.extra.r_deltas[i] = delta;

    // Handling R_RISCV_ALIGN is mandatory.
    //
    // R_RISCV_ALIGN refers to NOP instructions. We need to eliminate some
    // or all of the instructions so that the instruction that immediately
    // follows the NOPs is aligned to a specified alignment boundary.
    if (r.r_type == R_RISCV_ALIGN) {
      // The total bytes of NOPs is stored to r_addend, so the next
      // instruction is r_addend away.
      u64 loc = isec.get_addr() + r.r_offset - delta;
      u64 next_loc = loc + r.r_addend;
      u64 alignment = bit_ceil(r.r_addend + 1);
      assert(alignment <= (1 << isec.p2align));
      delta += next_loc - align_to(loc, alignment);
      continue;
    }

    // Handling other relocations is optional. A relocation is relaxable
    // only if it is immediately followed by an R_RISCV_RELAX.
    if (!ctx.arg.relax || i == rels.size() - 1 ||
        rels[i + 1].r_type != R_RISCV_RELAX)
      continue;

    // Linker-synthesized symbols haven't been assigned their final
    // values when we are shrinking sections because actual values can
    // be computed only after we fix the file layout. Therefore, we
    // assume that relocations against such symbols are always
    // non-relaxable.
    if (sym.file == ctx.internal_obj)
      continue;

    // Returns the index of the HI20-type relocation whose r_offset equals
    // this relocation's symbol value. It searches backward if the symbol
    // value is at or before this relocation, forward otherwise. It is a
    // fatal error if no such relocation exists.
    auto find_paired_reloc = [&] {
      if (sym.value <= rels[i].r_offset) {
        for (i64 j = i - 1; j >= 0; j--)
          if (is_hi20(rels[j]) && sym.value == rels[j].r_offset)
            return j;
      } else {
        for (i64 j = i + 1; j < rels.size(); j++)
          if (is_hi20(rels[j]) && sym.value == rels[j].r_offset)
            return j;
      }

      Fatal(ctx) << isec << ": paired relocation is missing: " << i;
    };

    switch (r.r_type) {
    case R_RISCV_CALL:
    case R_RISCV_CALL_PLT: {
      // These relocations refer to an AUIPC + JALR instruction pair to
      // allow to jump to anywhere in PC ± 2 GiB. If the jump target is
      // close enough to PC, we can use C.J, C.JAL or JAL instead.
      i64 dist = compute_distance(ctx, sym, isec, r);
      if (dist & 1)
        break;

      // rd of the JALR, i.e. the second instruction of the pair.
      i64 rd = get_rd(r.r_offset + 4);

      if (use_rvc && rd == 0 && sign_extend(dist, 11) == dist) {
        // If rd is x0 and the jump target is within ±2 KiB, we can use
        // C.J, saving 6 bytes.
        delta += 6;
      } else if (use_rvc && !E::is_64 && rd == 1 && sign_extend(dist, 11) == dist) {
        // If rd is x1 and the jump target is within ±2 KiB, we can use
        // C.JAL. This is RV32 only because C.JAL is RV32-only instruction.
        delta += 6;
      } else if (sign_extend(dist, 20) == dist) {
        // If the jump target is within ±1 MiB, we can use JAL.
        delta += 4;
      }
      break;
    }
    case R_RISCV_GOT_HI20: {
      // A GOT_HI20 followed by a PCREL_LO12_I is used to load a value from
      // GOT. If the loaded value is a link-time constant, we can rewrite
      // the instructions to directly materialize the value, eliminating a
      // memory load.
      if (sym.is_absolute() &&
          i + 3 < rels.size() &&
          rels[i + 1].r_type == R_RISCV_RELAX &&
          rels[i + 2].r_type == R_RISCV_PCREL_LO12_I &&
          rels[i + 2].r_offset == rels[i].r_offset + 4 &&
          isec.file.symbols[rels[i + 2].r_sym]->value == rels[i].r_offset &&
          rels[i + 3].r_type == R_RISCV_RELAX) {
        i64 rd = get_rd(r.r_offset);

        // Both instructions must write to the same register.
        if (rd == get_rd(r.r_offset + 4)) {
          u64 val = sym.get_addr(ctx) + r.r_addend;

          if (use_rvc && rd != 0 && sign_extend(val, 5) == val) {
            // Replace AUIPC + LD with C.LI.
            delta += 6;
          } else if (sign_extend(val, 11) == val) {
            // Replace AUIPC + LD with ADDI.
            delta += 4;
          }
        }
      }
      break;
    }
    case R_RISCV_HI20: {
      u64 val = sym.get_addr(ctx) + r.r_addend;
      i64 rd = get_rd(r.r_offset);

      if (sign_extend(val, 11) == val) {
        // We can replace `lui t0, %hi(foo)` and `add t0, t0, %lo(foo)`
        // instruction pair with `add t0, x0, %lo(foo)` if foo's bits
        // [32:11] are all one or all zero.
        delta += 4;
      } else if (use_rvc && rd != 0 && rd != 2 &&
                 sign_extend(val + 0x800, 17) == val + 0x800) {
        // If the upper 20 bits can actually be represented in 6 bits,
        // we can use C.LUI instead of LUI.
        //
        // The value LUI materializes is %hi(val) = (val + 0x800) >> 12
        // (the 0x800 compensates for the sign-extended %lo part), so it
        // is val + 0x800, not val itself, that must fit in 18 signed
        // bits for the immediate to fit in C.LUI's 6-bit field. The
        // immediate cannot be zero here because values within ±2 KiB
        // are handled by the branch above.
        delta += 2;
      }
      break;
    }
    case R_RISCV_TPREL_HI20:
    case R_RISCV_TPREL_ADD:
      // These relocations are used to add a high 20-bit value to the
      // thread pointer. The following two instructions materializes
      // TP + %tprel_hi20(foo) in %t0, for example.
      //
      //  lui  t0, %tprel_hi(foo)         # R_RISCV_TPREL_HI20
      //  add  t0, t0, tp                 # R_RISCV_TPREL_ADD
      //
      // Then thread-local variable `foo` is accessed with the low
      // 12-bit offset like this:
      //
      //  sw   t0, %tprel_lo(foo)(t0)     # R_RISCV_TPREL_LO12_S
      //
      // However, if the variable is at TP ± 2 KiB, TP + %tprel_hi20(foo)
      // is the same as TP, so we can instead access the thread-local
      // variable directly using TP like this:
      //
      //  sw   t0, %tprel_lo(foo)(tp)
      //
      // Here, we remove `lui` and `add` if the offset is within ±2 KiB.
      if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
          sign_extend(val, 11) == val)
        delta += 4;
      break;
    case R_RISCV_TLSDESC_HI20:
      // The instruction is removed if the symbol has no TLS descriptor.
      if (!sym.has_tlsdesc(ctx))
        delta += 4;
      break;
    case R_RISCV_TLSDESC_LOAD_LO12:
    case R_RISCV_TLSDESC_ADD_LO12: {
      // These relocations refer to the symbol of the paired HI20-type
      // relocation, so the decision is made based on that symbol.
      const ElfRel<E> &rel2 = rels[find_paired_reloc()];
      Symbol<E> &sym2 = *isec.file.symbols[rel2.r_sym];

      if (r.r_type == R_RISCV_TLSDESC_LOAD_LO12) {
        if (!sym2.has_tlsdesc(ctx))
          delta += 4;
      } else {
        assert(r.r_type == R_RISCV_TLSDESC_ADD_LO12);
        // Removed only if the symbol has neither a TLS descriptor nor
        // a GOT TP entry and its TP-relative offset is within ±2 KiB.
        if (!sym2.has_tlsdesc(ctx) && !sym2.has_gottp(ctx))
          if (i64 val = sym2.get_addr(ctx) + rel2.r_addend - ctx.tp_addr;
              sign_extend(val, 11) == val)
            delta += 4;
      }
      break;
    }
    }
  }

  // Record the total and shrink the section accordingly.
  isec.extra.r_deltas[rels.size()] = delta;
  isec.sh_size -= delta;
}

// Shrinks input sections according to their relocations, adjusts symbol
// values to account for the removed bytes, and then recomputes section
// sizes and output section offsets. Returns the value returned by
// set_osec_offsets().
//
// Although relaxation looks like an optional optimization (the longest
// instruction sequences are used by default), this pass must always
// run: R_RISCV_ALIGN directs the linker to align the referred location
// to a given byte boundary, and we have to interpret at least those
// relocations to satisfy the alignment constraints.
template <>
i64 riscv_resize_sections<E>(Context<E> &ctx) {
  Timer t(ctx, "riscv_resize_sections");

  // Whether 2-byte instructions may be used. This is usually true on
  // Unix because RV64GC is generally considered the baseline hardware.
  bool use_rvc = get_eflags(ctx) & EF_RISCV_RVC;

  // First pass: find all the relocations that can be relaxed and record
  // how much each section shrinks. This step should only shrink sections.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (std::unique_ptr<InputSection<E>> &sec : file->sections) {
      InputSection<E> *isec = sec.get();
      if (is_resizable(isec))
        shrink_section(ctx, *isec, use_rvc);
    }
  });

  // Second pass: fix symbol values. Only symbols owned by the file
  // being visited are touched.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (Symbol<E> *sym : file->symbols) {
      if (sym->file != file)
        continue;

      InputSection<E> *isec = sym->get_input_section();
      if (!isec)
        continue;

      auto &deltas = isec->extra.r_deltas;
      if (deltas.empty())
        continue;

      // Locate the first relocation at or after the symbol; the delta
      // recorded for it is the number of bytes removed before the symbol.
      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
      auto offset_less = [](const ElfRel<E> &r, u64 val) {
        return r.r_offset < val;
      };
      auto pos = std::lower_bound(rels.begin(), rels.end(), sym->value,
                                  offset_less);

      sym->value -= deltas[pos - rels.begin()];
    }
  });

  // Re-compute section sizes and offsets to finalize them.
  compute_section_sizes(ctx);
  return set_osec_offsets(ctx);
}

// ISA name handlers
//
// An example of ISA name is "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0".
// An ISA name starts with the base name (e.g. "rv64i2p1") followed by
// ISA extensions separated by underscores.
//
// There are lots of ISA extensions defined for RISC-V, and they are
// identified by name. Some extensions have single-letter names such as
// "m" or "q". Newer extension names start with "z" followed by one or
// more letters (e.g. "zicsr"). "s" and "x" prefixes are reserved
// for supervisor-level extensions and private extensions, respectively.
//
// Each extension consists of a name, a major version and a minor version.
// For example, "m2p0" indicates the "m" extension of version 2.0. "p" is
// just a separator. Versions are often omitted in documents, but they are
// mandatory in .riscv.attributes. Likewise, abbreviations as "g" (which
// is short for "IMAFD") are not allowed in .riscv.attributes.
//
// Each RISC-V object file contains an ISA string enumerating extensions
// used by the object file. We need to merge input objects' ISA strings
// into a single ISA string.
//
// In order to guarantee string uniqueness, extensions have to be ordered
// in a specific manner. The exact rule is unfortunately a bit complicated.
//
// The following functions take care of ISA strings.

namespace {
// One component of a parsed ISA string: a name plus a version.
// For example, "m2p0" is represented as {"m", 2, 0}. For the first
// component, parse_arch_string() stores the 5-letter base name
// (e.g. "rv64i") in `name`.
struct Extn {
  std::string name;  // Extension name, or the base name for the first entry
  i64 major;         // Major version (the number before "p")
  i64 minor;         // Minor version (the number after "p")
};
}

// As per the RISC-V spec, the extension names must be sorted in a very
// specific way, and unfortunately that's not just an alphabetical order.
// For example, rv64imafd is a legal ISA string, whereas rv64iafdm is not.
// The exact rule is somewhat arbitrary.
//
// This function returns true if the first extension name should precede
// the second one as per the rule.
static bool extn_name_less(const Extn &e1, const Extn &e2) {
  auto get_single_letter_rank = [](char c) -> i64 {
    std::string_view exts = "iemafdqlcbkjtpvnh";
    size_t pos = exts.find_first_of(c);
    if (pos != exts.npos)
      return pos;
    return c - 'a' + exts.size();
  };

  auto get_rank = [&](std::string_view str) -> i64 {
    switch (str[0]) {
    case 'x':
      return 1 << 20;
    case 's':
      return 1 << 19;
    case 'z':
      return (1 << 18) + get_single_letter_rank(str[1]);
    default:
      return get_single_letter_rank(str[0]);
    }
  };

  return std::tuple{get_rank(e1.name), e1.name} <
         std::tuple{get_rank(e2.name), e2.name};
}

// Returns true if e1's version is older than e2's. Major versions are
// compared first; minor versions break ties.
static bool extn_version_less(const Extn &e1, const Extn &e2) {
  if (e1.major != e2.major)
    return e1.major < e2.major;
  return e1.minor < e2.minor;
}

// Reads one "<name><major>p<minor>" token (e.g. "zicsr2p0") from the
// beginning of `str`. On success, the token is removed from `str` and
// the parsed extension is returned. Otherwise `str` is left untouched
// and an empty optional is returned.
static std::optional<Extn> read_extn_string(std::string_view &str) {
  static std::regex re(R"(^([a-z]+)(\d+)p(\d+))",
                       std::regex_constants::optimize |
                       std::regex_constants::ECMAScript);

  const char *begin = str.data();
  const char *end = begin + str.size();

  std::cmatch m;
  if (!std::regex_search(begin, end, m, re))
    return {};

  // The sub-matches point into the underlying buffer, not into the
  // view, so it is fine to advance the view before reading them.
  str = str.substr(m.length());
  return Extn{m[1], (i64)std::stoul(m[2]), (i64)std::stoul(m[3])};
}

static std::vector<Extn> parse_arch_string(std::string_view str) {
  if (str.size() < 5)
    return {};

  // Parse the base part
  std::string_view base = str.substr(0, 5);
  if (base != "rv32i" && base != "rv32e" && base != "rv64i" && base != "rv64e")
    return {};
  str = str.substr(4);

  std::optional<Extn> extn = read_extn_string(str);
  if (!extn)
    return {};

  std::vector<Extn> vec;
  extn->name = base;
  vec.push_back(*extn);

  // Parse extensions
  while (!str.empty()) {
    if (str[0] != '_')
      return {};
    str = str.substr(1);

    std::optional<Extn> extn = read_extn_string(str);
    if (!extn)
      return {};
    vec.push_back(*extn);
  }
  return vec;
}

// Merges two parsed ISA strings into one.
//
// Both inputs are expected to start with the base ISA entry followed by
// extensions in the canonical order. If an extension appears in both
// inputs, the one with the newer version is taken; otherwise entries
// are interleaved in the order defined by extn_name_less().
//
// Returns an empty vector if the inputs cannot be merged, i.e. if
// either input is empty or the base ISA names differ.
static std::vector<Extn> merge_extensions(std::span<Extn> x, std::span<Extn> y) {
  // An empty list has no base entry to compare; bail out before
  // indexing element 0 below.
  if (x.empty() || y.empty())
    return {};

  std::vector<Extn> vec;

  // The base part (i.e. "rv64i" or "rv32i") must match.
  if (x[0].name != y[0].name)
    return {};

  // Merge ISA extension strings
  while (!x.empty() && !y.empty()) {
    if (x[0].name == y[0].name) {
      // Same extension on both sides: keep the newer version.
      vec.push_back(extn_version_less(x[0], y[0]) ? y[0] : x[0]);
      x = x.subspan(1);
      y = y.subspan(1);
    } else if (extn_name_less(x[0], y[0])) {
      vec.push_back(x[0]);
      x = x.subspan(1);
    } else {
      vec.push_back(y[0]);
      y = y.subspan(1);
    }
  }

  // At most one of the inputs still has entries left; append them.
  vec.insert(vec.end(), x.begin(), x.end());
  vec.insert(vec.end(), y.begin(), y.end());
  return vec;
}

// Converts a non-empty list of extensions back to an ISA string. Each
// entry is formatted as "<name><major>p<minor>", and entries are joined
// with underscores.
static std::string to_string(std::span<Extn> v) {
  auto format_one = [](const Extn &e) {
    return e.name + std::to_string(e.major) + "p" + std::to_string(e.minor);
  };

  std::string out = format_one(v[0]);
  for (const Extn &e : v.subspan(1)) {
    out += "_";
    out += format_one(e);
  }
  return out;
}

//
// Output .riscv.attributes class
//

// Builds the contents of the output .riscv.attributes section by merging
// the attributes of all input object files, and sets sh_size.
//
// The merged attributes are the stack alignment (which must agree across
// files), the ISA string (merged with merge_extensions) and the
// unaligned-access flag (set if any file sets it). If no input file
// provides a usable ISA string, the section is left empty.
//
// This function does nothing if the contents have already been built.
template <>
void RiscvAttributesSection<E>::update_shdr(Context<E> &ctx) {
  if (!contents.empty())
    return;

  i64 stack = -1;          // -1 means "not specified by any file yet"
  std::vector<Extn> arch;  // Merged ISA string; empty until one is seen
  bool unaligned = false;

  for (ObjectFile<E> *file : ctx.objs) {
    if (file->extra.stack_align) {
      i64 val = *file->extra.stack_align;
      if (stack != -1 && stack != val)
        Error(ctx) << *file << ": stack alignment requirement mistmatch";
      stack = val;
    }

    if (file->extra.arch) {
      std::vector<Extn> arch2 = parse_arch_string(*file->extra.arch);

      if (arch2.empty()) {
        // An unparsable string is reported and then ignored. It must not
        // reach merge_extensions, which indexes element 0 of both of its
        // inputs and would read out of bounds on an empty list if
        // execution continues past Error().
        Error(ctx) << *file << ": corrupted .riscv.attributes ISA string: "
                   << *file->extra.arch;
      } else if (arch.empty()) {
        arch = arch2;
      } else {
        arch = merge_extensions(arch, arch2);
        if (arch.empty())
          Error(ctx) << *file << ": incompatible .riscv.attributes ISA string: "
                     << *file->extra.arch;
      }
    }

    if (file->extra.unaligned_access)
      unaligned = true;
  }

  if (arch.empty())
    return;

  // Reserve the ISA string plus 100 bytes of headroom for the headers
  // and the other tags; the buffer is trimmed to the actual size below.
  std::string arch_str = to_string(arch);
  contents.resize(arch_str.size() + 100);

  u8 *p = (u8 *)contents.data();
  *p++ = 'A';                             // Format version
  U32<E> *sub_sz = (U32<E> *)p;           // Sub-section length
  p += 4;
  p += write_string(p, "riscv");          // Vendor name
  u8 *sub_sub_start = p;
  *p++ = ELF_TAG_FILE;                    // Sub-section tag
  U32<E> *sub_sub_sz = (U32<E> *)p;       // Sub-sub-section length
  p += 4;

  if (stack != -1) {
    p += write_uleb(p, ELF_TAG_RISCV_STACK_ALIGN);
    p += write_uleb(p, stack);
  }

  p += write_uleb(p, ELF_TAG_RISCV_ARCH);
  p += write_string(p, arch_str);

  if (unaligned) {
    p += write_uleb(p, ELF_TAG_RISCV_UNALIGNED_ACCESS);
    p += write_uleb(p, 1);
  }

  // Fill in the length fields now that the final size is known. The
  // sub-section length covers everything after the format version byte;
  // the sub-sub-section length is counted from its tag byte.
  i64 sz = p - (u8 *)contents.data();
  *sub_sz = sz - 1;
  *sub_sub_sz = p - sub_sub_start;
  contents.resize(sz);
  this->shdr.sh_size = sz;
}

// Copies the section contents to the output buffer at this section's
// file offset.
template <>
void RiscvAttributesSection<E>::copy_buf(Context<E> &ctx) {
  auto *dst = ctx.buf + this->shdr.sh_offset;
  memcpy(dst, contents.data(), contents.size());
}

} // namespace mold::elf

#endif