// mold/elf/passes.cc
#include "mold.h"
#include <fstream>
#include <functional>
#include <map>
#include <optional>
#include <regex>
#include <shared_mutex>
#include <tbb/parallel_for_each.h>
#include <tbb/parallel_sort.h>
#include <tbb/partitioner.h>
#include <unordered_set>
namespace mold::elf {
// Since elf_main is a template, we can't run it without a type parameter.
// We speculatively run elf_main with X86_64, and if the speculation was
// wrong, re-run it with an actual machine type.
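// For example, if the command line specifies an AArch64 emulation, the
// initial elf_main<X86_64> run detects the mismatch while parsing its
// arguments and calls redo_main to re-dispatch to elf_main<ARM64>.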
template <typename E>
int redo_main(Context<E> &ctx, int argc, char **argv) {
std::string_view target = ctx.arg.emulation;
if (target == I386::target_name)
return elf_main<I386>(argc, argv);
if (target == ARM64::target_name)
return elf_main<ARM64>(argc, argv);
if (target == ARM32::target_name)
return elf_main<ARM32>(argc, argv);
if (target == RV64LE::target_name)
return elf_main<RV64LE>(argc, argv);
if (target == RV64BE::target_name)
return elf_main<RV64BE>(argc, argv);
if (target == RV32LE::target_name)
return elf_main<RV32LE>(argc, argv);
if (target == RV32BE::target_name)
return elf_main<RV32BE>(argc, argv);
if (target == PPC32::target_name)
return elf_main<PPC32>(argc, argv);
if (target == PPC64V1::target_name)
return elf_main<PPC64V1>(argc, argv);
if (target == PPC64V2::target_name)
return elf_main<PPC64V2>(argc, argv);
if (target == S390X::target_name)
return elf_main<S390X>(argc, argv);
if (target == SPARC64::target_name)
return elf_main<SPARC64>(argc, argv);
if (target == M68K::target_name)
return elf_main<M68K>(argc, argv);
if (target == SH4::target_name)
return elf_main<SH4>(argc, argv);
if (target == ALPHA::target_name)
return elf_main<ALPHA>(argc, argv);
if (target == LOONGARCH32::target_name)
return elf_main<LOONGARCH32>(argc, argv);
if (target == LOONGARCH64::target_name)
return elf_main<LOONGARCH64>(argc, argv);
unreachable();
}
template <typename E>
void apply_exclude_libs(Context<E> &ctx) {
Timer t(ctx, "apply_exclude_libs");
if (ctx.arg.exclude_libs.empty())
return;
std::unordered_set<std::string_view> set(ctx.arg.exclude_libs.begin(),
ctx.arg.exclude_libs.end());
for (ObjectFile<E> *file : ctx.objs) {
if (!file->archive_name.empty())
if (set.contains("ALL") ||
set.contains(filepath(file->archive_name).filename().string()))
file->exclude_libs = true;
}
}
template <typename E>
static bool has_debug_info_section(Context<E> &ctx) {
for (ObjectFile<E> *file : ctx.objs)
if (file->debug_info)
return true;
return false;
}
template <typename E>
void create_synthetic_sections(Context<E> &ctx) {
auto push = [&](auto *x) {
ctx.chunks.push_back(x);
ctx.chunk_pool.emplace_back(x);
return x;
};
if (!ctx.arg.oformat_binary) {
auto find = [&](std::string_view name) {
for (SectionOrder &ord : ctx.arg.section_order)
if (ord.type == SectionOrder::SECTION && ord.name == name)
return true;
return false;
};
if (ctx.arg.section_order.empty() || find("EHDR"))
ctx.ehdr = push(new OutputEhdr<E>(SHF_ALLOC));
else
ctx.ehdr = push(new OutputEhdr<E>(0));
if (ctx.arg.section_order.empty() || find("PHDR"))
ctx.phdr = push(new OutputPhdr<E>(SHF_ALLOC));
else
ctx.phdr = push(new OutputPhdr<E>(0));
if (ctx.arg.z_sectionheader)
ctx.shdr = push(new OutputShdr<E>);
}
ctx.got = push(new GotSection<E>);
if constexpr (!is_sparc<E>)
ctx.gotplt = push(new GotPltSection<E>(ctx));
ctx.reldyn = push(new RelDynSection<E>);
ctx.relplt = push(new RelPltSection<E>);
if (ctx.arg.pack_dyn_relocs_relr)
ctx.relrdyn = push(new RelrDynSection<E>);
ctx.strtab = push(new StrtabSection<E>);
ctx.plt = push(new PltSection<E>);
ctx.pltgot = push(new PltGotSection<E>);
ctx.symtab = push(new SymtabSection<E>);
ctx.dynsym = push(new DynsymSection<E>);
ctx.dynstr = push(new DynstrSection<E>);
ctx.eh_frame = push(new EhFrameSection<E>);
ctx.copyrel = push(new CopyrelSection<E>(false));
ctx.copyrel_relro = push(new CopyrelSection<E>(true));
if (ctx.shdr)
ctx.shstrtab = push(new ShstrtabSection<E>);
if (!ctx.arg.dynamic_linker.empty())
ctx.interp = push(new InterpSection<E>);
if (ctx.arg.build_id.kind != BuildId::NONE)
ctx.buildid = push(new BuildIdSection<E>);
if (ctx.arg.eh_frame_hdr)
ctx.eh_frame_hdr = push(new EhFrameHdrSection<E>);
if (ctx.arg.gdb_index && has_debug_info_section(ctx))
ctx.gdb_index = push(new GdbIndexSection<E>);
if (ctx.arg.z_relro && ctx.arg.section_order.empty() &&
ctx.arg.z_separate_code != SEPARATE_LOADABLE_SEGMENTS)
ctx.relro_padding = push(new RelroPaddingSection<E>);
if (ctx.arg.hash_style_sysv)
ctx.hash = push(new HashSection<E>);
if (ctx.arg.hash_style_gnu)
ctx.gnu_hash = push(new GnuHashSection<E>);
if (!ctx.arg.version_definitions.empty())
ctx.verdef = push(new VerdefSection<E>);
if (ctx.arg.emit_relocs)
ctx.eh_frame_reloc = push(new EhFrameRelocSection<E>);
if (ctx.arg.shared || !ctx.dsos.empty() || ctx.arg.pie) {
ctx.dynamic = push(new DynamicSection<E>);
// If .dynamic exists, .dynsym and .dynstr must exist as well
// since .dynamic refers to them.
ctx.dynstr->shdr.sh_size = 1;
ctx.dynsym->symbols.resize(1);
}
ctx.versym = push(new VersymSection<E>);
ctx.verneed = push(new VerneedSection<E>);
ctx.note_package = push(new NotePackageSection<E>);
ctx.note_property = push(new NotePropertySection<E>);
if constexpr (is_riscv<E>)
ctx.extra.riscv_attributes = push(new RiscvAttributesSection<E>);
if constexpr (is_ppc64v1<E>)
ctx.extra.opd = push(new PPC64OpdSection);
if constexpr (is_ppc64v2<E>)
ctx.extra.save_restore = push(new PPC64SaveRestoreSection);
if constexpr (is_sparc<E>) {
if (ctx.arg.is_static)
ctx.extra.tls_get_addr_sec = push(new SparcTlsGetAddrSection);
ctx.extra.tls_get_addr_sym = get_symbol(ctx, "__tls_get_addr");
}
if constexpr (is_alpha<E>)
ctx.extra.got = push(new AlphaGotSection);
}
template <typename E>
static void mark_live_objects(Context<E> &ctx) {
for (Symbol<E> *sym : ctx.arg.undefined)
if (sym->file)
sym->file->is_alive = true;
for (Symbol<E> *sym : ctx.arg.require_defined)
if (sym->file)
sym->file->is_alive = true;
std::vector<InputFile<E> *> roots;
for (InputFile<E> *file : ctx.objs)
if (file->is_alive)
roots.push_back(file);
for (InputFile<E> *file : ctx.dsos)
if (file->is_alive)
roots.push_back(file);
tbb::parallel_for_each(roots, [&](InputFile<E> *file,
tbb::feeder<InputFile<E> *> &feeder) {
if (file->is_alive)
file->mark_live_objects(ctx, [&](InputFile<E> *obj) { feeder.add(obj); });
});
}
template <typename E>
void do_resolve_symbols(Context<E> &ctx) {
auto for_each_file = [&](std::function<void(InputFile<E> *)> fn) {
tbb::parallel_for_each(ctx.objs, fn);
tbb::parallel_for_each(ctx.dsos, fn);
};
// For historical reasons, archive members will only get included in the final
// binary if they satisfy one of the undefined symbols in a non-archive object
// file. This is called archive extraction. In finalize_archive_extraction,
// this is processed as follows:
//
// 1. Do preliminary symbol resolution assuming all archive members
// are included. This matches the undefined symbols with ones to be
// extracted from archives.
//
// 2. Do a mark & sweep pass to eliminate unneeded archive members.
//
// Note that the symbol resolution inside finalize_archive_extraction uses a
// different rule. In order to prevent extracting archive members that can be
// satisfied by either non-archive object files or DSOs, the archive members
// are given a lower priority. This is not correct for the general case, where
// *extracted* object files have precedence over DSOs and even non-archive
// files that are passed earlier in the command line. Hence, the symbol
// resolution is thrown away once we determine which archive members to
// extract, and redone later with the formal rule.
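// As a concrete example, when linking "mold -o out main.o libfoo.a", a
// member bar.o of libfoo.a is extracted only if it defines a symbol that
// is still undefined after processing main.o (or other already-extracted
// members); otherwise bar.o does not become part of the output at all.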
{
Timer t(ctx, "extract_archive_members");
// Register symbols
for_each_file([&](InputFile<E> *file) { file->resolve_symbols(ctx); });
// Mark reachable objects to decide which files to include in the output.
// This also merges symbol visibility.
mark_live_objects(ctx);
// Cleanup. The rule used for archive extraction isn't accurate for the
// general case of symbol resolution, so reset the resolution to be redone
// later.
for_each_file([](InputFile<E> *file) { file->clear_symbols(); });
// Now that the symbol references are gone, remove the eliminated files from
// the file list.
std::erase_if(ctx.objs, [](InputFile<E> *file) { return !file->is_alive; });
std::erase_if(ctx.dsos, [](InputFile<E> *file) { return !file->is_alive; });
}
// COMDAT elimination needs to happen exactly here.
//
// It needs to be after archive extraction; otherwise we might assign a
// COMDAT leader to an archive member that is not supposed to be extracted.
//
// It needs to happen before symbol resolution; otherwise we could eliminate
// a symbol that references have already been resolved to, causing dangling
// references.
{
Timer t(ctx, "eliminate_comdats");
tbb::parallel_for_each(ctx.objs, [](ObjectFile<E> *file) {
for (ComdatGroupRef<E> &ref : file->comdat_groups)
update_minimum(ref.group->owner, file->priority);
});
tbb::parallel_for_each(ctx.objs, [](ObjectFile<E> *file) {
for (ComdatGroupRef<E> &ref : file->comdat_groups)
if (ref.group->owner != file->priority)
for (u32 i : ref.members)
if (file->sections[i])
file->sections[i]->kill();
});
}
// Since we have turned on object files' live bits, their symbols
// may now have higher priority than before. So run the symbol
// resolution pass again to get the final resolution result.
for_each_file([&](InputFile<E> *file) { file->resolve_symbols(ctx); });
}
template <typename E>
void resolve_symbols(Context<E> &ctx) {
Timer t(ctx, "resolve_symbols");
std::vector<ObjectFile<E> *> objs = ctx.objs;
std::vector<SharedFile<E> *> dsos = ctx.dsos;
do_resolve_symbols(ctx);
bool has_lto_obj = false;
for (ObjectFile<E> *file : objs)
if (file->is_alive && (file->is_lto_obj || file->is_gcc_offload_obj))
has_lto_obj = true;
if (has_lto_obj) {
// Do link-time optimization. We pass all IR object files to the
// compiler backend to compile them into a few ELF object files.
//
// The compiler backend needs to know how symbols are resolved,
// so compute symbol visibility, import/export bits, etc early.
mark_live_objects(ctx);
apply_version_script(ctx);
parse_symbol_version(ctx);
compute_import_export(ctx);
// Do LTO. It compiles IR object files into a few big ELF files.
std::vector<ObjectFile<E> *> lto_objs = do_lto(ctx);
// do_resolve_symbols() has removed unreferenced files. Restore the
// original file lists here because some of the removed files may have to
// be resurrected, as they are referenced by the ELF files returned from
// do_lto().
ctx.objs = objs;
ctx.dsos = dsos;
append(ctx.objs, lto_objs);
// Redo name resolution from scratch.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->clear_symbols();
});
tbb::parallel_for_each(ctx.dsos, [&](SharedFile<E> *file) {
file->clear_symbols();
});
// Remove IR object files.
for (ObjectFile<E> *file : ctx.objs)
if (file->is_lto_obj)
file->is_alive = false;
std::erase_if(ctx.objs, [](ObjectFile<E> *file) { return file->is_lto_obj; });
do_resolve_symbols(ctx);
}
}
// .eh_frame sections are parsed and regenerated by the linker for the purpose
// of deduplication and garbage collection. As such, the input sections should
// not be copied over.
//
// However, in very rare cases (e.g. GCC CRT compiled with LTO) we might need
// to resolve cross-object .eh_frame section references (they only point to
// the beginning or end of the section and don't depend on the actual
// section contents).
// Therefore, the sections are "killed" after symbol resolution as a separate
// pass.
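// For example, GCC's crtbegin object defines a symbol (__EH_FRAME_BEGIN__)
// at the start of its .eh_frame fragment for the runtime's unwind table
// registration; such a reference must still resolve even though the
// section data itself is regenerated by the linker.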
template <typename E>
void kill_eh_frame_sections(Context<E> &ctx) {
Timer t(ctx, "kill_eh_frame_sections");
for (ObjectFile<E> *file : ctx.objs)
for (InputSection<E> *sec : file->eh_frame_sections)
sec->is_alive = false;
}
template <typename E>
void resolve_section_pieces(Context<E> &ctx) {
Timer t(ctx, "resolve_section_pieces");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->initialize_mergeable_sections(ctx);
});
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->resolve_section_pieces(ctx);
});
}
template <typename E>
void convert_common_symbols(Context<E> &ctx) {
Timer t(ctx, "convert_common_symbols");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->convert_common_symbols(ctx);
});
}
template <typename E>
static std::string get_cmdline_args(Context<E> &ctx) {
std::stringstream ss;
ss << ctx.cmdline_args[1];
for (i64 i = 2; i < ctx.cmdline_args.size(); i++)
ss << " " << ctx.cmdline_args[i];
return ss.str();
}
template <typename E>
void add_comment_string(Context<E> &ctx, std::string str) {
MergedSection<E> *sec =
MergedSection<E>::get_instance(ctx, ".comment", SHT_PROGBITS,
SHF_MERGE | SHF_STRINGS, 1, 1);
std::string_view buf = save_string(ctx, str);
// +1 to include the trailing NUL appended by save_string, so that the
// string stored in .comment is null-terminated.
std::string_view data(buf.data(), buf.size() + 1);
sec->insert(ctx, data, hash_string(data), 0);
}
template <typename E>
void compute_merged_section_sizes(Context<E> &ctx) {
Timer t(ctx, "compute_merged_section_sizes");
// Add an identification string to .comment.
if (!ctx.arg.oformat_binary)
add_comment_string(ctx, mold_version);
// Embed command line arguments for debugging.
if (char *env = getenv("MOLD_DEBUG"); env && env[0])
add_comment_string(ctx, "mold command line: " + get_cmdline_args(ctx));
tbb::parallel_for_each(ctx.merged_sections,
[&](std::unique_ptr<MergedSection<E>> &sec) {
sec->assign_offsets(ctx);
});
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
template <typename E>
static u64 canonicalize_type(std::string_view name, u64 type) {
if (type == SHT_PROGBITS) {
if (name == ".init_array" || name.starts_with(".init_array."))
return SHT_INIT_ARRAY;
if (name == ".fini_array" || name.starts_with(".fini_array."))
return SHT_FINI_ARRAY;
}
if constexpr (is_x86_64<E>)
if (type == SHT_X86_64_UNWIND)
return SHT_PROGBITS;
return type;
}
struct OutputSectionKey {
bool operator==(const OutputSectionKey &) const = default;
std::string_view name;
u64 type;
};
template <typename E>
std::string_view
get_output_name(Context<E> &ctx, std::string_view name, u64 flags) {
if (ctx.arg.relocatable && !ctx.arg.relocatable_merge_sections)
return name;
if (ctx.arg.unique && ctx.arg.unique->match(name))
return name;
if (flags & SHF_MERGE)
return name;
if constexpr (is_arm32<E>) {
if (name.starts_with(".ARM.exidx"))
return ".ARM.exidx";
if (name.starts_with(".ARM.extab"))
return ".ARM.extab";
}
if constexpr (is_alpha<E>) {
if (name.starts_with(".sdata."))
return ".sdata";
if (name.starts_with(".sbss."))
return ".sbss";
}
if (ctx.arg.z_keep_text_section_prefix) {
static std::string_view prefixes[] = {
".text.hot.", ".text.unknown.", ".text.unlikely.", ".text.startup.",
".text.exit."
};
for (std::string_view prefix : prefixes) {
std::string_view stem = prefix.substr(0, prefix.size() - 1);
if (name == stem || name.starts_with(prefix))
return stem;
}
}
static std::string_view prefixes[] = {
".text.", ".data.rel.ro.", ".data.", ".rodata.", ".bss.rel.ro.", ".bss.",
".init_array.", ".fini_array.", ".tbss.", ".tdata.", ".gcc_except_table.",
".ctors.", ".dtors.", ".gnu.warning.", ".openbsd.randomdata.",
};
for (std::string_view prefix : prefixes) {
std::string_view stem = prefix.substr(0, prefix.size() - 1);
if (name == stem || name.starts_with(prefix))
return stem;
}
return name;
}
template <typename E>
static OutputSectionKey
get_output_section_key(Context<E> &ctx, InputSection<E> &isec) {
// If .init_array/.fini_array exist, .ctors/.dtors must be merged
// with them.
//
// CRT object files contain .ctors/.dtors sections without any
// relocations. They contain sentinel values, 0 and -1, to mark the
// beginning and the end of the initializer/finalizer pointer arrays.
// We do not place them into .init_array/.fini_array because such
// invalid pointer values would simply make the program crash.
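// Schematically, a concatenated .ctors section looks like this, with the
// sentinel values coming from the CRT files:
//
//   crtbegin.o:  .ctors = { -1 }        <- beginning-of-array marker
//   foo.o:       .ctors = { &ctor1 }
//   crtend.o:    .ctors = { 0 }         <- end-of-array marker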
if (ctx.has_init_array && !isec.get_rels(ctx).empty()) {
std::string_view name = isec.name();
if (name == ".ctors" || name.starts_with(".ctors."))
return {".init_array", SHT_INIT_ARRAY};
if (name == ".dtors" || name.starts_with(".dtors."))
return {".fini_array", SHT_FINI_ARRAY};
}
const ElfShdr<E> &shdr = isec.shdr();
std::string_view name = get_output_name(ctx, isec.name(), shdr.sh_flags);
u64 type = canonicalize_type<E>(name, shdr.sh_type);
return {name, type};
}
// Create output sections for input sections.
template <typename E>
void create_output_sections(Context<E> &ctx) {
Timer t(ctx, "create_output_sections");
struct Hash {
size_t operator()(const OutputSectionKey &k) const {
return combine_hash(hash_string(k.name), std::hash<u64>{}(k.type));
}
};
std::unordered_map<OutputSectionKey, OutputSection<E> *, Hash> map;
std::shared_mutex mu;
i64 size = ctx.osec_pool.size();
// Instantiate output sections
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
// Make a per-thread cache of the main map to avoid lock contention.
// It makes a noticeable difference if we have millions of input sections.
decltype(map) cache;
{
std::shared_lock lock(mu);
cache = map;
}
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (!isec || !isec->is_alive)
continue;
const ElfShdr<E> &shdr = isec->shdr();
u32 sh_flags = shdr.sh_flags & ~SHF_MERGE & ~SHF_STRINGS &
~SHF_COMPRESSED & ~SHF_GNU_RETAIN;
if (ctx.arg.relocatable && (sh_flags & SHF_GROUP)) {
OutputSection<E> *osec = new OutputSection<E>(isec->name(), shdr.sh_type);
osec->sh_flags = sh_flags;
isec->output_section = osec;
ctx.osec_pool.emplace_back(osec);
continue;
}
OutputSectionKey key = get_output_section_key(ctx, *isec);
if (auto it = cache.find(key); it != cache.end()) {
isec->output_section = it->second;
continue;
}
auto get_or_insert = [&] {
{
std::shared_lock lock(mu);
if (auto it = map.find(key); it != map.end())
return it->second;
}
std::unique_ptr<OutputSection<E>> osec =
std::make_unique<OutputSection<E>>(key.name, key.type);
std::unique_lock lock(mu);
auto [it, inserted] = map.insert({key, osec.get()});
OutputSection<E> *ret = it->second;
if (inserted)
ctx.osec_pool.emplace_back(std::move(osec));
return ret;
};
OutputSection<E> *osec = get_or_insert();
osec->sh_flags |= sh_flags & ~SHF_GROUP;
isec->output_section = osec;
cache.insert({key, osec});
}
});
for (std::unique_ptr<OutputSection<E>> &osec : ctx.osec_pool) {
osec->shdr.sh_flags = osec->sh_flags;
// Handle --section-align
if (!ctx.arg.section_align.empty())
if (auto it = ctx.arg.section_align.find(osec->name);
it != ctx.arg.section_align.end())
osec->shdr.sh_addralign = it->second;
// PT_GNU_RELRO segment is a security mechanism to make more pages
// read-only than we could have done without it.
//
// Traditionally, sections are either read-only or read-write. If a
// section contains dynamic relocations, it must have been put into a
// read-write segment so that the program loader can mutate its
// contents in memory, even if no one will write to it at runtime.
//
// A RELRO segment allows us to make such pages writable only while the
// program is being loaded. After that, the pages become read-only.
//
// Some sections, such as .init, .fini, .got and .dynamic, contain
// dynamic relocations but don't have to be writable at runtime,
// so they are put into a RELRO segment.
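// For example, under PIE, .init_array holds absolute function pointers, so
// it needs R_*_RELATIVE dynamic relocations applied at load time, but
// nothing writes to it afterwards, which makes it an ideal RELRO candidate.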
u32 type = osec->shdr.sh_type;
u32 flags = osec->shdr.sh_flags;
osec->is_relro = (osec->name == ".toc" || osec->name.ends_with(".rel.ro") ||
type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY ||
type == SHT_PREINIT_ARRAY || (flags & SHF_TLS));
}
// Add input sections to output sections
std::vector<Chunk<E> *> chunks;
for (i64 i = size; i < ctx.osec_pool.size(); i++)
chunks.push_back(ctx.osec_pool[i].get());
for (ObjectFile<E> *file : ctx.objs)
for (std::unique_ptr<InputSection<E>> &isec : file->sections)
if (isec && isec->is_alive)
isec->output_section->members.push_back(isec.get());
// Add output sections and mergeable sections to ctx.chunks
for (std::unique_ptr<MergedSection<E>> &osec : ctx.merged_sections)
if (osec->shdr.sh_size)
chunks.push_back(osec.get());
// Sections are added to the section lists in an arbitrary order
// because they are created in parallel. Sort them to make the
// output deterministic.
tbb::parallel_sort(chunks.begin(), chunks.end(), [](Chunk<E> *x, Chunk<E> *y) {
return std::tuple(x->name, x->shdr.sh_type, x->shdr.sh_flags) <
std::tuple(y->name, y->shdr.sh_type, y->shdr.sh_flags);
});
append(ctx.chunks, chunks);
}
// Create a dummy object file containing linker-synthesized
// symbols.
template <typename E>
void create_internal_file(Context<E> &ctx) {
ObjectFile<E> *obj = new ObjectFile<E>;
ctx.obj_pool.emplace_back(obj);
ctx.internal_obj = obj;
ctx.objs.push_back(obj);
// Create linker-synthesized symbols.
ctx.internal_esyms.resize(1);
obj->symbols.push_back(new Symbol<E>);
obj->first_global = 1;
obj->is_alive = true;
obj->priority = 1;
auto add = [&](Symbol<E> *sym) {
obj->symbols.push_back(sym);
// An actual value will be set to a linker-synthesized symbol by
// fix_synthetic_symbols(). Until then, `value` doesn't have a valid
// value. 0xdeadbeef is a recognizable dummy value to make debugging easier
// if the field is accidentally used before it gets a valid one.
sym->value = 0xdeadbeef;
ElfSym<E> esym;
memset(&esym, 0, sizeof(esym));
esym.st_type = STT_NOTYPE;
esym.st_shndx = SHN_ABS;
esym.st_bind = STB_GLOBAL;
esym.st_visibility = STV_DEFAULT;
ctx.internal_esyms.push_back(esym);
};
// Add --defsym'd symbols
for (i64 i = 0; i < ctx.arg.defsyms.size(); i++)
add(ctx.arg.defsyms[i].first);
// Add --section-order symbols
for (SectionOrder &ord : ctx.arg.section_order)
if (ord.type == SectionOrder::SYMBOL)
add(get_symbol(ctx, ord.name));
obj->elf_syms = ctx.internal_esyms;
obj->has_symver.resize(ctx.internal_esyms.size() - 1);
}
template <typename E>
static std::optional<std::string>
get_start_stop_name(Context<E> &ctx, Chunk<E> &chunk) {
if ((chunk.shdr.sh_flags & SHF_ALLOC) && !chunk.name.empty()) {
if (is_c_identifier(chunk.name))
return std::string(chunk.name);
if (ctx.arg.start_stop) {
auto isalnum = [](char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
('0' <= c && c <= '9');
};
std::string s{chunk.name};
if (s.starts_with('.'))
s = s.substr(1);
for (i64 i = 0; i < s.size(); i++)
if (!isalnum(s[i]))
s[i] = '_';
return s;
}
}
return {};
}
template <typename E>
void add_synthetic_symbols(Context<E> &ctx) {
ObjectFile<E> &obj = *ctx.internal_obj;
auto add = [&](std::string_view name, u32 type = STT_NOTYPE) {
ElfSym<E> esym;
memset(&esym, 0, sizeof(esym));
esym.st_type = type;
esym.st_shndx = SHN_ABS;
esym.st_bind = STB_GLOBAL;
esym.st_visibility = STV_HIDDEN;
ctx.internal_esyms.push_back(esym);
Symbol<E> *sym = get_symbol(ctx, name);
sym->value = 0xdeadbeef; // unique dummy value
obj.symbols.push_back(sym);
return sym;
};
ctx.__ehdr_start = add("__ehdr_start");
ctx.__init_array_start = add("__init_array_start");
ctx.__init_array_end = add("__init_array_end");
ctx.__fini_array_start = add("__fini_array_start");
ctx.__fini_array_end = add("__fini_array_end");
ctx.__preinit_array_start = add("__preinit_array_start");
ctx.__preinit_array_end = add("__preinit_array_end");
ctx._DYNAMIC = add("_DYNAMIC");
ctx._GLOBAL_OFFSET_TABLE_ = add("_GLOBAL_OFFSET_TABLE_");
ctx._PROCEDURE_LINKAGE_TABLE_ = add("_PROCEDURE_LINKAGE_TABLE_");
ctx.__bss_start = add("__bss_start");
ctx._end = add("_end");
ctx._etext = add("_etext");
ctx._edata = add("_edata");
ctx.__executable_start = add("__executable_start");
ctx.__rel_iplt_start =
add(E::is_rela ? "__rela_iplt_start" : "__rel_iplt_start");
ctx.__rel_iplt_end =
add(E::is_rela ? "__rela_iplt_end" : "__rel_iplt_end");
if (ctx.arg.eh_frame_hdr)
ctx.__GNU_EH_FRAME_HDR = add("__GNU_EH_FRAME_HDR");
if (!get_symbol(ctx, "end")->file)
ctx.end = add("end");
if (!get_symbol(ctx, "etext")->file)
ctx.etext = add("etext");
if (!get_symbol(ctx, "edata")->file)
ctx.edata = add("edata");
if (!get_symbol(ctx, "__dso_handle")->file)
ctx.__dso_handle = add("__dso_handle");
if constexpr (supports_tlsdesc<E>)
ctx._TLS_MODULE_BASE_ = add("_TLS_MODULE_BASE_", STT_TLS);
if constexpr (is_riscv<E>) {
ctx.__global_pointer = add("__global_pointer$");
if (ctx.dynamic && !ctx.arg.shared)
ctx.__global_pointer->is_exported = true;
}
if constexpr (is_arm32<E>) {
ctx.__exidx_start = add("__exidx_start");
ctx.__exidx_end = add("__exidx_end");
}
if constexpr (is_ppc64<E>)
ctx.extra.TOC = add(".TOC.");
if constexpr (is_ppc32<E>)
ctx.extra._SDA_BASE_ = add("_SDA_BASE_");
for (Chunk<E> *chunk : ctx.chunks) {
if (std::optional<std::string> name = get_start_stop_name(ctx, *chunk)) {
add(save_string(ctx, "__start_" + *name));
add(save_string(ctx, "__stop_" + *name));
if (ctx.arg.physical_image_base) {
add(save_string(ctx, "__phys_start_" + *name));
add(save_string(ctx, "__phys_stop_" + *name));
}
}
}
if constexpr (is_ppc64v2<E>)
for (auto [label, insn] : ppc64_save_restore_insns)
if (!label.empty())
add(label);
obj.elf_syms = ctx.internal_esyms;
obj.has_symver.resize(ctx.internal_esyms.size() - 1);
obj.resolve_symbols(ctx);
// Make all synthetic symbols relative ones by associating them to
// a dummy output section.
for (Symbol<E> *sym : obj.symbols)
if (sym->file == &obj)
sym->set_output_section(ctx.symtab);
// Handle --defsym symbols.
for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) {
Symbol<E> *sym1 = ctx.arg.defsyms[i].first;
std::variant<Symbol<E> *, u64> val = ctx.arg.defsyms[i].second;
if (Symbol<E> **ref = std::get_if<Symbol<E> *>(&val)) {
Symbol<E> *sym2 = *ref;
if (!sym2->file) {
Error(ctx) << "--defsym: undefined symbol: " << *sym2;
continue;
}
ElfSym<E> &esym = obj.elf_syms[i + 1];
esym.st_type = sym2->esym().st_type;
if constexpr (is_ppc64v2<E>)
esym.ppc_local_entry = sym2->esym().ppc_local_entry;
if (sym2->is_absolute())
sym1->origin = 0;
} else {
sym1->origin = 0;
}
}
}
template <typename E>
void check_cet_errors(Context<E> &ctx) {
bool warning = (ctx.arg.z_cet_report == CET_REPORT_WARNING);
assert(warning || ctx.arg.z_cet_report == CET_REPORT_ERROR);
auto has_feature = [](ObjectFile<E> *file, u32 feature) {
return std::any_of(file->gnu_properties.begin(), file->gnu_properties.end(),
[&](std::pair<u32, u32> kv) {
return kv.first == GNU_PROPERTY_X86_FEATURE_1_AND &&
(kv.second & feature);
});
};
for (ObjectFile<E> *file : ctx.objs) {
if (file == ctx.internal_obj)
continue;
if (!has_feature(file, GNU_PROPERTY_X86_FEATURE_1_IBT)) {
if (warning)
Warn(ctx) << *file << ": -cet-report=warning: "
<< "missing GNU_PROPERTY_X86_FEATURE_1_IBT";
else
Error(ctx) << *file << ": -cet-report=error: "
<< "missing GNU_PROPERTY_X86_FEATURE_1_IBT";
}
if (!has_feature(file, GNU_PROPERTY_X86_FEATURE_1_SHSTK)) {
if (warning)
Warn(ctx) << *file << ": -cet-report=warning: "
<< "missing GNU_PROPERTY_X86_FEATURE_1_SHSTK";
else
Error(ctx) << *file << ": -cet-report=error: "
<< "missing GNU_PROPERTY_X86_FEATURE_1_SHSTK";
}
}
}
template <typename E>
void print_dependencies(Context<E> &ctx) {
SyncOut(ctx) <<
R"(# This is an output of the mold linker's --print-dependencies option.
#
# Each line consists of 4 fields, <section1>, <section2>, <symbol-type> and
# <symbol>, separated by tab characters. It indicates that <section1> depends
# on <section2> to use <symbol>. <symbol-type> is either "u" or "w" for
# regular undefined or weak undefined, respectively.
#
# If you want to obtain dependency information per function granularity,
# compile source files with the -ffunction-sections compiler flag.
)";
auto println = [&](auto &src, Symbol<E> &sym, ElfSym<E> &esym) {
if (InputSection<E> *isec = sym.get_input_section())
SyncOut(ctx) << src << "\t" << *isec
<< "\t" << (esym.is_weak() ? 'w' : 'u')
<< "\t" << sym;
else
SyncOut(ctx) << src << "\t" << *sym.file
<< "\t" << (esym.is_weak() ? 'w' : 'u')
<< "\t" << sym;
};
for (ObjectFile<E> *file : ctx.objs) {
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (!isec)
continue;
std::unordered_set<void *> visited;
for (const ElfRel<E> &r : isec->get_rels(ctx)) {
if (r.r_type == R_NONE)
continue;
ElfSym<E> &esym = file->elf_syms[r.r_sym];
Symbol<E> &sym = *file->symbols[r.r_sym];
if (esym.is_undef() && sym.file && sym.file != file &&
visited.insert((void *)&sym).second)
println(*isec, sym, esym);
}
}
}
for (SharedFile<E> *file : ctx.dsos) {
for (i64 i = file->first_global; i < file->symbols.size(); i++) {
ElfSym<E> &esym = file->elf_syms[i];
Symbol<E> &sym = *file->symbols[i];
if (esym.is_undef() && sym.file && sym.file != file)
println(*file, sym, esym);
}
}
}
template <typename E>
static std::string create_response_file(Context<E> &ctx) {
std::string buf;
std::stringstream out;
std::string cwd = std::filesystem::current_path().string();
out << "-C " << cwd.substr(1) << "\n";
if (cwd != "/") {
out << "--chroot ..";
i64 depth = std::count(cwd.begin(), cwd.end(), '/');
for (i64 i = 1; i < depth; i++)
out << "/..";
out << "\n";
}
for (i64 i = 1; i < ctx.cmdline_args.size(); i++) {
std::string_view arg = ctx.cmdline_args[i];
if (arg != "-repro" && arg != "--repro")
out << arg << "\n";
}
return out.str();
}
template <typename E>
void write_repro_file(Context<E> &ctx) {
std::string path = ctx.arg.output + ".repro.tar";
std::unique_ptr<TarWriter> tar =
TarWriter::open(path, filepath(ctx.arg.output).filename().string() + ".repro");
if (!tar)
Fatal(ctx) << "cannot open " << path << ": " << errno_string();
tar->append("response.txt", save_string(ctx, create_response_file(ctx)));
tar->append("version.txt", save_string(ctx, mold_version + "\n"));
std::unordered_set<std::string_view> seen;
for (std::unique_ptr<MappedFile> &mf : ctx.mf_pool) {
if (!mf->parent && seen.insert(mf->name).second) {
// We reopen a file because we may have modified the contents of mf
// in memory, which is mapped with PROT_WRITE and MAP_PRIVATE.
MappedFile *mf2 = must_open_file(ctx, mf->name);
tar->append(to_abs_path(mf->name).string(), mf2->get_contents());
mf2->unmap();
}
}
}
template <typename E>
void check_duplicate_symbols(Context<E> &ctx) {
Timer t(ctx, "check_duplicate_symbols");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
const ElfSym<E> &esym = file->elf_syms[i];
Symbol<E> &sym = *file->symbols[i];
// Skip if our symbol is undef or weak
if (sym.file == file || sym.file == ctx.internal_obj ||
esym.is_undef() || esym.is_common() || (esym.st_bind == STB_WEAK))
continue;
// Skip if our symbol is in a dead section. In most cases, the
// section has been eliminated due to comdat deduplication.
if (!esym.is_abs()) {
InputSection<E> *isec = file->get_section(esym);
if (!isec || !isec->is_alive)
continue;
}
Error(ctx) << "duplicate symbol: " << *file << ": " << *sym.file
<< ": " << sym;
}
});
ctx.checkpoint();
}
template <typename E>
void check_symbol_types(Context<E> &ctx) {
Timer t(ctx, "check_symbol_types");
std::vector<InputFile<E> *> files;
append(files, ctx.objs);
append(files, ctx.dsos);
tbb::parallel_for_each(files.begin(), files.end(), [&](InputFile<E> *file) {
for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
Symbol<E> &sym = *file->symbols[i];
if (!sym.file || sym.file == file)
continue;
const ElfSym<E> &esym1 = sym.esym();
const ElfSym<E> &esym2 = file->elf_syms[i];
u32 ty1 = (esym1.st_type == STT_GNU_IFUNC) ? (u32)STT_FUNC : esym1.st_type;
u32 ty2 = (esym2.st_type == STT_GNU_IFUNC) ? (u32)STT_FUNC : esym2.st_type;
if (ty1 != STT_NOTYPE && ty2 != STT_NOTYPE && ty1 != ty2)
Warn(ctx) << "symbol type mismatch: " << sym << '\n'
<< ">>> defined in " << *sym.file << " as "
<< stt_to_string<E>(esym1.st_type) << '\n'
<< ">>> defined in " << *file << " as "
<< stt_to_string<E>(esym2.st_type);
}
});
}
template <typename E>
static i64 get_init_fini_priority(InputSection<E> *isec) {
static std::regex re(R"(\.(\d+)$)", std::regex_constants::optimize);
std::string_view name = isec->name();
std::cmatch m;
if (std::regex_search(name.data(), name.data() + name.size(), m, re))
return std::stoi(m[1]);
return 65536;
}
template <typename E>
static i64 get_ctor_dtor_priority(InputSection<E> *isec) {
auto opts = std::regex_constants::optimize | std::regex_constants::ECMAScript;
static std::regex re1(R"((?:clang_rt\.)?crtbegin)", opts);
static std::regex re2(R"((?:clang_rt\.)?crtend)", opts);
static std::regex re3(R"(\.(\d+)$)", opts);
// crtbegin.o and crtend.o contain marker symbols such as
// __CTOR_LIST__ or __DTOR_LIST__. So they have to be at the
// beginning or end of the section.
std::smatch m;
if (std::regex_search(isec->file.filename, m, re1))
return -2;
if (std::regex_search(isec->file.filename, m, re2))
return 65536;
std::string name(isec->name());
if (std::regex_search(name, m, re3))
return std::stoi(m[1]);
return -1;
}
template <typename E>
void sort_init_fini(Context<E> &ctx) {
Timer t(ctx, "sort_init_fini");
for (Chunk<E> *chunk : ctx.chunks) {
if (OutputSection<E> *osec = chunk->to_osec()) {
if (osec->name == ".init_array" || osec->name == ".preinit_array" ||
osec->name == ".fini_array") {
if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_REVERSE)
std::reverse(osec->members.begin(), osec->members.end());
std::unordered_map<InputSection<E> *, i64> map;
for (InputSection<E> *isec : osec->members) {
std::string_view name = isec->name();
if (name.starts_with(".ctors") || name.starts_with(".dtors"))
map.insert({isec, 65535 - get_ctor_dtor_priority(isec)});
else
map.insert({isec, get_init_fini_priority(isec)});
}
sort(osec->members, [&](InputSection<E> *a, InputSection<E> *b) {
return map[a] < map[b];
});
}
}
}
}
template <typename E>
void sort_ctor_dtor(Context<E> &ctx) {
Timer t(ctx, "sort_ctor_dtor");
for (Chunk<E> *chunk : ctx.chunks) {
if (OutputSection<E> *osec = chunk->to_osec()) {
if (osec->name == ".ctors" || osec->name == ".dtors") {
if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_REVERSE)
std::reverse(osec->members.begin(), osec->members.end());
std::unordered_map<InputSection<E> *, i64> map;
for (InputSection<E> *isec : osec->members)
map.insert({isec, get_ctor_dtor_priority(isec)});
sort(osec->members, [&](InputSection<E> *a, InputSection<E> *b) {
return map[a] < map[b];
});
}
}
}
}
// .ctors/.dtors serve the same purpose as .init_array/.fini_array,
// albeit with very subtle differences. Both contain pointers to
// initializer/finalizer functions, but the runtime executes the two in
// the exact opposite order to one another. Therefore, if we are to
// place the contents of .ctors/.dtors into .init_array/.fini_array, we
// need to reverse them.
//
// It's unfortunate that we have both .ctors/.dtors and
// .init_array/.fini_array in ELF for historical reasons, but that's
// the reality we need to deal with.
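// A worked example: if the input .ctors sections contain { f1, f2, f3 } in
// total, the runtime would invoke f3, f2 and then f1 (back to front).
// .init_array runs front to back, so to preserve that order we must store
// { f3, f2, f1 }, which is what the loop below does by reversing both the
// section contents and the offsets of the corresponding relocations.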
template <typename E>
void fixup_ctors_in_init_array(Context<E> &ctx) {
Timer t(ctx, "fixup_ctors_in_init_array");
for (Chunk<E> *chunk : ctx.chunks) {
if (OutputSection<E> *osec = chunk->to_osec()) {
if (osec->name == ".init_array" || osec->name == ".fini_array") {
for (InputSection<E> *isec : osec->members) {
if (isec->name().starts_with(".ctors") ||
isec->name().starts_with(".dtors")) {
if (isec->sh_size % sizeof(Word<E>)) {
Error(ctx) << *isec << ": section corrupted";
continue;
}
u8 *buf = (u8 *)isec->contents.data();
std::reverse((Word<E> *)buf, (Word<E> *)(buf + isec->sh_size));
std::span<ElfRel<E>> rels = isec->get_rels(ctx);
for (ElfRel<E> &r : rels)
r.r_offset = isec->sh_size - r.r_offset - sizeof(Word<E>);
std::reverse(rels.begin(), rels.end());
}
}
}
}
}
}
template <typename T>
static void shuffle(std::vector<T> &vec, u64 seed) {
if (vec.empty())
return;
// Xorshift random number generator. We use this RNG because it is
// measurably faster than MT19937.
auto rand = [&] {
seed ^= seed << 13;
seed ^= seed >> 7;
seed ^= seed << 17;
return seed;
};
// The Fisher-Yates shuffling algorithm.
//
// We don't want to use std::shuffle for build reproducibility. That is,
// std::shuffle's implementation is not guaranteed to be the same across
// platforms, so even though the result is guaranteed to be randomly
// shuffled, the exact order may be different across implementations.
//
// We are not using std::uniform_int_distribution for the same reason.
for (i64 i = 0; i < vec.size() - 1; i++)
std::swap(vec[i], vec[i + rand() % (vec.size() - i)]);
}
template <typename E>
void shuffle_sections(Context<E> &ctx) {
Timer t(ctx, "shuffle_sections");
auto is_eligible = [](OutputSection<E> *osec) {
if (osec) {
std::string_view name = osec->name;
return name != ".init" && name != ".fini" &&
name != ".ctors" && name != ".dtors" &&
name != ".init_array" && name != ".preinit_array" &&
name != ".fini_array";
}
return false;
};
switch (ctx.arg.shuffle_sections) {
case SHUFFLE_SECTIONS_SHUFFLE: {
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (OutputSection<E> *osec = chunk->to_osec(); is_eligible(osec)) {
u64 seed = ctx.arg.shuffle_sections_seed + hash_string(osec->name);
shuffle(osec->members, seed);
}
});
break;
}
case SHUFFLE_SECTIONS_REVERSE:
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (OutputSection<E> *osec = chunk->to_osec(); is_eligible(osec))
std::reverse(osec->members.begin(), osec->members.end());
});
break;
default:
unreachable();
}
}
template <typename E>
void compute_section_sizes(Context<E> &ctx) {
Timer t(ctx, "compute_section_sizes");
struct Group {
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
std::span<InputSection<E> *> members;
};
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
OutputSection<E> *osec = chunk->to_osec();
if (!osec)
return;
// Executable sections may need range extension thunks, so they are
// processed in the next loop instead.
if constexpr (needs_thunk<E>)
if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable)
return;
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
std::vector<Group> groups;
constexpr i64 group_size = 10000;
for (std::span<InputSection<E> *> span : split(osec->members, group_size))
groups.push_back(Group{.members = span});
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
ElfShdr<E> &shdr = osec->shdr;
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
});
// On ARM32 or ARM64, we may need to create so-called "range extension
// thunks" to extend branch instructions' reach, as they can jump only
// within ±16 MiB or ±128 MiB, respectively.
//
// In the following loop, we compute the sizes of sections while
// inserting thunks. This pass cannot be parallelized. That is,
// create_range_extension_thunks is parallelized internally, but the
// function itself is not thread-safe.
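// Conceptually, a thunk is just a short trampoline placed within reach of
// the caller. On ARM64 it could look roughly like this (a sketch, not
// necessarily mold's exact instruction sequence):
//
//   thunk:
//     adrp x16, target           // load the 4 KiB page address of the target
//     add  x16, x16, :lo12:target
//     br   x16                   // register-indirect jump, unlimited range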
if constexpr (needs_thunk<E>) {
Timer t2(ctx, "create_range_extension_thunks");
if (!ctx.arg.relocatable)
for (Chunk<E> *chunk : ctx.chunks)
if (OutputSection<E> *osec = chunk->to_osec())
if (osec->shdr.sh_flags & SHF_EXECINSTR)
osec->create_range_extension_thunks(ctx);
}
}
// Find all unresolved symbols and attach them to the most appropriate files.
// Note that even a symbol that will be reported as an undefined symbol will
// get an owner file in this function. Such symbols will be reported by
// ObjectFile<E>::scan_relocations().
template <typename E>
void claim_unresolved_symbols(Context<E> &ctx) {
Timer t(ctx, "claim_unresolved_symbols");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
if (!file->is_alive)
return;
for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
const ElfSym<E> &esym = file->elf_syms[i];
Symbol<E> &sym = *file->symbols[i];
if (!esym.is_undef())
continue;
std::scoped_lock lock(sym.mu);
if (sym.file)
if (!sym.esym().is_undef() || sym.file->priority <= file->priority)
continue;
// If a symbol name is in the form of "foo@version", search for
// symbol "foo" and check if the symbol has version "version".
if (file->has_symver.get(i - file->first_global)) {
std::string_view str = file->symbol_strtab.data() + esym.st_name;
i64 pos = str.find('@');
assert(pos != str.npos);
std::string_view name = str.substr(0, pos);
std::string_view ver = str.substr(pos + 1);
Symbol<E> *sym2 = get_symbol(ctx, name);
if (sym2->file && sym2->file->is_dso && sym2->get_version() == ver) {
file->symbols[i] = sym2;
sym2->is_imported = true;
continue;
}
}
auto claim = [&](bool is_imported) {
if (sym.is_traced)
SyncOut(ctx) << "trace-symbol: " << *file << ": unresolved"
<< (esym.is_weak() ? " weak" : "")
<< " symbol " << sym;
sym.file = file;
sym.origin = 0;
sym.value = 0;
sym.sym_idx = i;
sym.is_weak = false;
sym.is_imported = is_imported;
sym.is_exported = false;
sym.ver_idx = is_imported ? 0 : ctx.default_version;
};
if (esym.is_undef_weak()) {
if (ctx.arg.shared && sym.visibility != STV_HIDDEN &&
ctx.arg.z_dynamic_undefined_weak) {
// Global weak undefined symbols are promoted to dynamic symbols
// when linking a DSO unless `-z nodynamic_undefined_weak` was given.
claim(true);
} else {
// Otherwise, weak undefs are converted to absolute symbols with value 0.
claim(false);
}
continue;
}
// Traditionally, remaining undefined symbols cause a link failure
// only when we are creating an executable. Undefined symbols in
// shared objects are promoted to dynamic symbols, so that they'll
// get another chance to be resolved at run-time. You can change the
// behavior by passing `-z defs` to the linker.
//
// Even if `-z defs` is given, weak undefined symbols are still
// promoted to dynamic symbols for compatibility with other linkers.
// Some major programs, notably Firefox, depend on the behavior
// (they use this loophole to export symbols from libxul.so).
if (ctx.arg.shared && sym.visibility != STV_HIDDEN && !ctx.arg.z_defs) {
claim(true);
continue;
}
// Convert remaining undefined symbols to absolute symbols with value 0.
claim(false);
}
});
}
template <typename E>
void scan_relocations(Context<E> &ctx) {
Timer t(ctx, "scan_relocations");
// Scan relocations to find dynamic symbols.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->scan_relocations(ctx);
});
// Exit if there was a relocation that refers to an undefined symbol.
ctx.checkpoint();
// Aggregate dynamic symbols to a single vector.
std::vector<InputFile<E> *> files;
append(files, ctx.objs);
append(files, ctx.dsos);
std::vector<std::vector<Symbol<E> *>> vec(files.size());
tbb::parallel_for((i64)0, (i64)files.size(), [&](i64 i) {
for (Symbol<E> *sym : files[i]->symbols)
if (sym->file == files[i])
if (sym->flags || sym->is_imported || sym->is_exported)
vec[i].push_back(sym);
});
std::vector<Symbol<E> *> syms = flatten(vec);
ctx.symbol_aux.reserve(syms.size());
if (ctx.needs_tlsld)
ctx.got->add_tlsld(ctx);
// Assign offsets in additional tables for each dynamic symbol.
for (Symbol<E> *sym : syms) {
sym->add_aux(ctx);
if (sym->is_imported || sym->is_exported)
ctx.dynsym->add_symbol(ctx, sym);
if (sym->flags & NEEDS_GOT)
ctx.got->add_got_symbol(ctx, sym);
if (sym->flags & NEEDS_CPLT) {
sym->is_canonical = true;
// A canonical PLT needs to be visible from DSOs.
sym->is_exported = true;
// We can't use .plt.got for a canonical PLT because otherwise
// .plt.got and .got would refer to each other, resulting in an
// infinite loop at runtime.
ctx.plt->add_symbol(ctx, sym);
} else if (sym->flags & NEEDS_PLT) {
if (sym->flags & NEEDS_GOT)
ctx.pltgot->add_symbol(ctx, sym);
else
ctx.plt->add_symbol(ctx, sym);
}
if (sym->flags & NEEDS_GOTTP)
ctx.got->add_gottp_symbol(ctx, sym);
if (sym->flags & NEEDS_TLSGD)
ctx.got->add_tlsgd_symbol(ctx, sym);
if (sym->flags & NEEDS_TLSDESC)
ctx.got->add_tlsdesc_symbol(ctx, sym);
if (sym->flags & NEEDS_COPYREL) {
if (((SharedFile<E> *)sym->file)->is_readonly(sym))
ctx.copyrel_relro->add_symbol(ctx, sym);
else
ctx.copyrel->add_symbol(ctx, sym);
}
if constexpr (is_ppc64v1<E>)
if (sym->flags & NEEDS_PPC_OPD)
ctx.extra.opd->add_symbol(ctx, sym);
sym->flags = 0;
}
if constexpr (is_alpha<E>)
ctx.extra.got->finalize();
if (ctx.has_textrel && ctx.arg.warn_textrel)
Warn(ctx) << "creating a DT_TEXTREL in an output file";
}
// Compute the is_weak bit for each imported symbol.
//
// If all references to a shared symbol are weak, the symbol is marked
// as weak in .dynsym.
template <typename E>
void compute_imported_symbol_weakness(Context<E> &ctx) {
Timer t(ctx, "compute_imported_symbol_weakness");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
const ElfSym<E> &esym = file->elf_syms[i];
Symbol<E> &sym = *file->symbols[i];
if (esym.is_undef() && !esym.is_weak() && sym.file && sym.file->is_dso) {
std::scoped_lock lock(sym.mu);
sym.is_weak = false;
}
}
});
}
// Report all undefined symbols, grouped by symbol.
template <typename E>
void report_undef_errors(Context<E> &ctx) {
constexpr i64 max_errors = 3;
for (auto &pair : ctx.undef_errors) {
Symbol<E> *sym = pair.first;
std::span<std::string> errors = pair.second;
std::stringstream ss;
ss << "undefined symbol: "
<< (ctx.arg.demangle ? demangle(*sym) : sym->name())
<< "\n";
for (i64 i = 0; i < errors.size() && i < max_errors; i++)
ss << errors[i];
if (errors.size() > max_errors)
ss << ">>> referenced " << (errors.size() - max_errors) << " more times\n";
if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR)
Error(ctx) << ss.str();
else if (ctx.arg.unresolved_symbols == UNRESOLVED_WARN)
Warn(ctx) << ss.str();
}
ctx.checkpoint();
}
template <typename E>
void create_reloc_sections(Context<E> &ctx) {
Timer t(ctx, "create_reloc_sections");
// Create .rela.* sections
tbb::parallel_for((i64)0, (i64)ctx.chunks.size(), [&](i64 i) {
if (OutputSection<E> *osec = ctx.chunks[i]->to_osec())
osec->reloc_sec.reset(new RelocSection<E>(ctx, *osec));
});
for (i64 i = 0, end = ctx.chunks.size(); i < end; i++)
if (OutputSection<E> *osec = ctx.chunks[i]->to_osec())
if (RelocSection<E> *x = osec->reloc_sec.get())
ctx.chunks.push_back(x);
}
// Copy chunks to an output file
template <typename E>
void copy_chunks(Context<E> &ctx) {
Timer t(ctx, "copy_chunks");
auto copy = [&](Chunk<E> &chunk) {
std::string name = chunk.name.empty() ? "(header)" : std::string(chunk.name);
Timer t2(ctx, name, &t);
chunk.copy_buf(ctx);
};
// For --relocatable and --emit-relocs, we want to copy non-relocation
// sections first. This is because REL-type relocation sections (as
// opposed to RELA-type) store relocation addends in their target sections.
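// For example, on i386, which uses REL-type relocations, the addend of an
// R_386_32 relocation is stored in the very bytes being relocated, so the
// relocation copier must read the target section only after the section
// has been written.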
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (chunk->shdr.sh_type != SHT_REL)
copy(*chunk);
});
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (chunk->shdr.sh_type == SHT_REL)
copy(*chunk);
});
// Undefined symbols in SHF_ALLOC sections are found by scan_relocations(),
// but those in non-SHF_ALLOC sections cannot be found until we copy section
// contents. So we need to call this function again to report possible
// undefined errors.
report_undef_errors(ctx);
if constexpr (is_arm32<E>)
fixup_arm_exidx_section(ctx);
}
// Rewrite the leading endbr64 instruction with a nop if a function
// symbol's address was not taken.
template <typename E>
void rewrite_endbr(Context<E> &ctx) {
Timer t(ctx, "rewrite_endbr");
assert(is_x86_64<E>);
// Compute address-taken bit for each symbol
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC)) {
for (const ElfRel<E> &rel : isec->get_rels(ctx)) {
Symbol<E> &sym = *file->symbols[rel.r_sym];
if (!is_func_call_rel(rel) && sym.esym().st_type == STT_FUNC) {
std::scoped_lock lock(sym.mu);
sym.address_taken = true;
}
}
}
}
});
// Exported symbols are conservatively assumed to be address-taken.
if (ctx.dynsym)
for (Symbol<E> *sym : ctx.dynsym->symbols)
if (sym && sym->is_exported)
sym->address_taken = true;
// Some symbols are implicitly address-taken
ctx.arg.entry->address_taken = true;
ctx.arg.init->address_taken = true;
ctx.arg.fini->address_taken = true;
// Rewrite endbr64 with nop
u8 endbr64[] = {0xf3, 0x0f, 0x1e, 0xfa};
u8 nop[] = {0x0f, 0x1f, 0x40, 0x00};
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->symbols) {
if (sym->file == file && sym->esym().st_type == STT_FUNC &&
!sym->address_taken) {
if (InputSection<E> *isec = sym->get_input_section()) {
if (OutputSection<E> *osec = isec->output_section) {
u8 *buf = ctx.buf + osec->shdr.sh_offset + isec->offset +
sym->value;
if (memcmp(buf, endbr64, 4) == 0)
memcpy(buf, nop, 4);
}
}
}
}
});
}
template <typename E>
void construct_relr(Context<E> &ctx) {
Timer t(ctx, "construct_relr");
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
chunk->construct_relr(ctx);
});
}
template <typename E>
void create_output_symtab(Context<E> &ctx) {
Timer t(ctx, "compute_symtab_size");
if (!ctx.arg.strip_all && !ctx.arg.retain_symbols_file) {
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
chunk->compute_symtab_size(ctx);
});
}
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->compute_symtab_size(ctx);
});
tbb::parallel_for_each(ctx.dsos, [&](SharedFile<E> *file) {
file->compute_symtab_size(ctx);
});
}
template <typename E>
void apply_version_script(Context<E> &ctx) {
Timer t(ctx, "apply_version_script");
// Assign versions to symbols specified with `extern "C++"` or
// wildcard patterns first.
MultiGlob matcher;
MultiGlob cpp_matcher;
// The "local:" label has a special meaning in the version script.
// It can appear in any VERSION clause, and it hides matched symbols
// unless other non-local patterns match them. In other words,
// "local:" has lower precedence than other version definitions.
//
// If two or more non-local patterns match the same symbol, the
// last one takes precedence.
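// For example, in the following version script, `foo` is assigned VER1 even
// though the "local: *" wildcard also matches it:
//
//   VER1 { global: foo; local: *; };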
std::vector<VersionPattern> patterns = ctx.version_patterns;
std::stable_partition(patterns.begin(), patterns.end(),
[](const VersionPattern &pat) {
return pat.ver_idx == VER_NDX_LOCAL;
});
for (i64 i = 0; i < patterns.size(); i++) {
VersionPattern &v = patterns[i];
if (v.is_cpp) {
if (!cpp_matcher.add(v.pattern, i))
Fatal(ctx) << "invalid version pattern: " << v.pattern;
} else if (v.pattern.find_first_of("*?[") != v.pattern.npos) {
if (!matcher.add(v.pattern, i))
Fatal(ctx) << "invalid version pattern: " << v.pattern;
}
}
if (!matcher.empty() || !cpp_matcher.empty()) {
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->get_global_syms()) {
if (sym->file != file)
continue;
std::string_view name = sym->name();
i64 match = -1;
if (std::optional<i64> idx = matcher.find(name))
match = std::max(match, *idx);
// Match non-mangled symbols against the C++ pattern as well.
// Weird, but required to match other linkers' behavior.
if (!cpp_matcher.empty()) {
if (std::optional<std::string_view> s = demangle_cpp(name))
name = *s;
if (std::optional<i64> idx = cpp_matcher.find(name))
match = std::max(match, *idx);
}
if (match != -1)
sym->ver_idx = patterns[match].ver_idx;
}
});
}
// Next, assign versions to symbols specified by exact name.
// In other words, exact matches take precedence over
// wildcard or `extern "C++"` patterns.
for (VersionPattern &v : patterns) {
if (!v.is_cpp && v.pattern.find_first_of("*?[") == v.pattern.npos) {
Symbol<E> *sym = get_symbol(ctx, v.pattern);
if (!sym->file && !ctx.arg.undefined_version)
Warn(ctx) << v.source << ": cannot assign version `" << v.ver_str
<< "` to symbol `" << *sym << "`: symbol not found";
if (sym->file && !sym->file->is_dso)
sym->ver_idx = v.ver_idx;
}
}
}
template <typename E>
void parse_symbol_version(Context<E> &ctx) {
if (!ctx.arg.shared)
return;
Timer t(ctx, "parse_symbol_version");
std::unordered_map<std::string_view, u16> verdefs;
for (i64 i = 0; i < ctx.arg.version_definitions.size(); i++)
verdefs[ctx.arg.version_definitions[i]] = i + VER_NDX_LAST_RESERVED + 1;
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
if (file == ctx.internal_obj)
return;
for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
// Match VERSION part of symbol foo@VERSION with version definitions.
if (!file->has_symver.get(i - file->first_global))
continue;
Symbol<E> *sym = file->symbols[i];
if (sym->file != file)
continue;
const char *name = file->symbol_strtab.data() + file->elf_syms[i].st_name;
std::string_view ver = strchr(name, '@') + 1;
bool is_default = false;
if (ver.starts_with('@')) {
is_default = true;
ver = ver.substr(1);
}
auto it = verdefs.find(ver);
if (it == verdefs.end()) {
Error(ctx) << *file << ": symbol " << *sym << " has undefined version "
<< ver;
continue;
}
sym->ver_idx = it->second;
if (!is_default)
sym->ver_idx |= VERSYM_HIDDEN;
// If both symbol `foo` and `foo@VERSION` are defined, `foo@VERSION`
// hides `foo` so that all references to `foo` are resolved to a
// versioned symbol. Likewise, if `foo@VERSION` and `foo@@VERSION` are
// defined, the default one takes precedence.
Symbol<E> *sym2 = get_symbol(ctx, sym->name());
if (sym2->file == file &&
!file->has_symver.get(sym2->sym_idx - file->first_global))
if (sym2->ver_idx == ctx.default_version ||
(sym2->ver_idx & ~VERSYM_HIDDEN) == (sym->ver_idx & ~VERSYM_HIDDEN))
sym2->ver_idx = VER_NDX_LOCAL;
}
});
}
template <typename E>
void compute_import_export(Context<E> &ctx) {
Timer t(ctx, "compute_import_export");
// If we are creating an executable, we want to export symbols referenced
// by DSOs unless they are explicitly marked as local by a version script.
if (!ctx.arg.shared) {
tbb::parallel_for_each(ctx.dsos, [&](SharedFile<E> *file) {
for (Symbol<E> *sym : file->symbols) {
if (sym->file && !sym->file->is_dso && sym->visibility != STV_HIDDEN &&
sym->ver_idx != VER_NDX_LOCAL) {
std::scoped_lock lock(sym->mu);
sym->is_exported = true;
}
}
});
}
auto should_export = [&](Symbol<E> &sym) {
if (sym.visibility == STV_HIDDEN)
return false;
switch (sym.ver_idx) {
case VER_NDX_UNSPECIFIED:
if (ctx.arg.shared)
return !((ObjectFile<E> *)sym.file)->exclude_libs;
return ctx.arg.export_dynamic;
case VER_NDX_LOCAL:
return false;
default:
return true;
}
};
// Export symbols that are not hidden or marked as local.
// We also want to mark imported symbols as such.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->get_global_syms()) {
// If we are using a symbol in a DSO, we need to import it.
if (sym->file && sym->file->is_dso) {
std::scoped_lock lock(sym->mu);
sym->is_imported = true;
continue;
}
// If we have a definition of a symbol, we may want to export it.
if (sym->file == file && should_export(*sym)) {
sym->is_exported = true;
// Exported symbols are marked as imported as well by default
// for DSOs.
if (ctx.arg.shared &&
sym->visibility != STV_PROTECTED &&
!ctx.arg.Bsymbolic &&
!(ctx.arg.Bsymbolic_functions && sym->get_type() == STT_FUNC))
sym->is_imported = true;
}
}
});
// Apply --dynamic-list, --export-dynamic-symbol and
// --export-dynamic-symbol-list options.
//
// The semantics of these options vary depending on whether we are
// creating an executable or a shared object.
//
// For executables, matched symbols are exported.
//
// For shared objects, matched symbols are imported if they are already
// exported, so that they are interposable. In other words, symbols
// that did not match will be bound locally within the output file,
// effectively turning them into protected symbols.
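// For example, given the following dynamic list, only `foo` remains
// interposable when creating a shared object (assuming it is exported);
// other exported symbols are bound within the output file:
//
//   { foo; };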
MultiGlob matcher;
MultiGlob cpp_matcher;
auto handle_match = [&](Symbol<E> *sym) {
if (ctx.arg.shared) {
if (sym->is_exported)
sym->is_imported = true;
} else {
if (sym->file && !sym->file->is_dso && sym->visibility != STV_HIDDEN)
sym->is_exported = true;
}
};
for (DynamicPattern &p : ctx.dynamic_list_patterns) {
if (p.is_cpp) {
if (!cpp_matcher.add(p.pattern, 1))
Fatal(ctx) << p.source << ": invalid dynamic list entry: "
<< p.pattern;
continue;
}
if (p.pattern.find_first_of("*?[") != p.pattern.npos) {
if (!matcher.add(p.pattern, 1))
Fatal(ctx) << p.source << ": invalid dynamic list entry: "
<< p.pattern;
continue;
}
handle_match(get_symbol(ctx, p.pattern));
}
if (!matcher.empty() || !cpp_matcher.empty()) {
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->get_global_syms()) {
if (sym->file != file)
continue;
if (ctx.arg.shared && !sym->is_exported)
continue;
std::string_view name = sym->name();
if (matcher.find(name)) {
handle_match(sym);
} else if (!cpp_matcher.empty()) {
if (std::optional<std::string_view> s = demangle_cpp(name))
name = *s;
if (cpp_matcher.find(name))
handle_match(sym);
}
}
});
}
}

// Compute the "address-taken" bit for each input section.
//
// As a space-saving optimization, we want to merge two read-only objects
// into a single object if their contents are equivalent. That
// optimization is called Identical Code Folding, or ICF.
//
// A catch is that comparing object contents is not enough to determine
// whether two objects can be merged safely; we also need to take care
// of pointer equivalence.
//
// In C/C++, two pointers compare equal if and only if they point to the
// same object. Merging two objects into one can break this assumption
// because two distinct pointers would become equal as a result of the
// merge. We can still merge one object with another if no pointer to
// the object is taken in code, because then there is no address to
// compare.
//
// In mold, each input section has an "address-taken" bit. If there is a
// pointer-taking reference to the object, it's set to true. At the ICF
// stage, we merge only objects whose addresses were not taken.
//
// For functions, address-taking relocations are separated from
// non-address-taking ones. For example, x86-64 uses R_X86_64_PLT32 for
// direct function calls (e.g., "call foo" to call the function foo) while
// R_X86_64_PC32 or R_X86_64_GOT32 are used for pointer-taking operations.
//
// Unfortunately, for data, we can't distinguish between address-taking
// relocations and non-address-taking ones. LLVM generates an "address
// significance" table in the ".llvm_addrsig" section to mark symbols
// whose addresses are taken in code. If that table is available, we use
// that information in this function. Otherwise, we conservatively assume
// that all data items are address-taken.
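//
// For example, in the following hypothetical C translation unit, f and
// g may compile to identical machine code, but folding them would
// break the program because the language guarantees that distinct
// functions have distinct addresses:
//
//   int f(int x) { return x + 1; }
//   int g(int x) { return x + 1; }
//   int ok(void) { return f != g; } // must return 1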
template <typename E>
void compute_address_significance(Context<E> &ctx) {
Timer t(ctx, "compute_address_significance");

  // Set the address-taken bit for executable sections first.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (std::unique_ptr<InputSection<E>> &src : file->sections)
if (src && src->is_alive && (src->shdr().sh_flags & SHF_ALLOC))
for (const ElfRel<E> &r : src->get_rels(ctx))
if (!is_func_call_rel(r))
if (InputSection<E> *dst = file->symbols[r.r_sym]->get_input_section())
if (dst->shdr().sh_flags & SHF_EXECINSTR)
dst->address_taken = true;
});
auto mark = [](Symbol<E> *sym) {
if (sym)
if (InputSection<E> *isec = sym->get_input_section())
isec->address_taken = true;
};
// Some symbols' pointer values are leaked to the dynamic section.
mark(ctx.arg.entry);
mark(ctx.arg.init);
mark(ctx.arg.fini);
// Exported symbols are conservatively considered address-taken.
if (ctx.dynsym)
for (Symbol<E> *sym : ctx.dynsym->symbols)
if (sym && sym->is_exported)
mark(sym);
// Handle data objects.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
if (InputSection<E> *sec = file->llvm_addrsig.get()) {
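      // The .llvm_addrsig payload is a sequence of ULEB128-encoded
      // symbol indices. For example, the bytes {0x05, 0xe8, 0x07}
      // decode to the two symbol indices 5 and 1000.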
u8 *p = (u8 *)sec->contents.data();
u8 *end = p + sec->contents.size();
while (p != end)
mark(file->symbols[read_uleb(&p)]);
} else {
for (std::unique_ptr<InputSection<E>> &isec : file->sections)
if (isec && !(isec->shdr().sh_flags & SHF_EXECINSTR))
isec->address_taken = true;
}
});
}

template <typename E>
void clear_padding(Context<E> &ctx) {
Timer t(ctx, "clear_padding");
auto zero = [&](Chunk<E> *chunk, i64 next_start) {
i64 pos = chunk->shdr.sh_offset + chunk->shdr.sh_size;
memset(ctx.buf + pos, 0, next_start - pos);
};
std::vector<Chunk<E> *> chunks = ctx.chunks;
std::erase_if(chunks, [](Chunk<E> *chunk) {
return chunk->shdr.sh_type == SHT_NOBITS;
});
for (i64 i = 1; i < chunks.size(); i++)
zero(chunks[i - 1], chunks[i]->shdr.sh_offset);
zero(chunks.back(), ctx.output_file->filesize);
}

// We want to sort output chunks in the following order.
//
// <ELF header>
// <program header>
// .interp
// .note
// .hash
// .gnu.hash
// .dynsym
// .dynstr
// .gnu.version
// .gnu.version_r
// .rela.dyn
// .rela.plt
// <readonly data>
// <readonly code>
// <writable tdata>
// <writable tbss>
// <writable RELRO data>
// .got
// .toc
// .alpha_got
// <writable RELRO bss>
// .relro_padding
// <writable non-RELRO data>
// <writable non-RELRO bss>
// <non-memory-allocated sections>
// <section header>
// .gdb_index
//
// .interp and some other linker-synthesized sections are placed at the
// beginning of the file because they are needed by the loader.
// Especially on a hard drive with spinning disks, it is important to be
// able to read these sections in a single seek.
//
// .note sections are also placed at the beginning so that they are
// included in a core crash dump even if it's truncated by ulimit. In
// particular, if .note.gnu.build-id is in a truncated core file, you
// can at least identify which executable has crashed.
//
// .gdb_index cannot be constructed before applying relocations to
// other debug sections, so we create it after completing the other
// parts of the output file and append it at the very end.
//
// A PT_NOTE segment may contain multiple .note sections, but there is
// no way to represent a gap between them. Therefore, we sort .note
// sections by decreasing alignment requirement. I believe each .note
// section's size is a multiple of its alignment, so by sorting them by
// alignment we should be able to avoid gaps between .note sections.
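//
// For example, if we have .note sections with alignments {8, 4, 4},
// placing the 8-byte-aligned section first means each subsequent
// section already starts at a suitably aligned address, so the PT_NOTE
// segment ends up with no internal gaps.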
//
// .toc is placed right after .got for PPC64. The PPC-specific .toc
// section contains data that may be accessed with a 16-bit signed
// offset relative to %r2, where %r2 is set to .got + 32 KiB. Therefore,
// .toc needs to be within [.got, .got + 64 KiB).
//
// Other file layouts are possible, but this layout is chosen to keep
// the number of segments as small as possible.
template <typename E>
void sort_output_sections_regular(Context<E> &ctx) {
auto get_rank1 = [&](Chunk<E> *chunk) {
u64 type = chunk->shdr.sh_type;
u64 flags = chunk->shdr.sh_flags;
if (chunk == ctx.ehdr)
return 0;
if (chunk == ctx.phdr)
return 1;
if (chunk == ctx.interp)
return 2;
if (type == SHT_NOTE && (flags & SHF_ALLOC))
return 3;
if (chunk == ctx.hash)
return 4;
if (chunk == ctx.gnu_hash)
return 5;
if (chunk == ctx.dynsym)
return 6;
if (chunk == ctx.dynstr)
return 7;
if (chunk == ctx.versym)
return 8;
if (chunk == ctx.verneed)
return 9;
if (chunk == ctx.reldyn)
return 10;
if (chunk == ctx.relplt)
return 11;
if (chunk == ctx.shdr)
return INT32_MAX - 1;
if (chunk == ctx.gdb_index)
return INT32_MAX;
bool alloc = (flags & SHF_ALLOC);
bool writable = (flags & SHF_WRITE);
bool exec = (flags & SHF_EXECINSTR);
bool tls = (flags & SHF_TLS);
bool relro = chunk->is_relro;
bool is_bss = (type == SHT_NOBITS);
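
    // Pack the attribute bits so that sections sort in the intended
    // order: allocated before non-allocated, read-only before writable,
    // non-executable before executable (so read-only data precedes
    // code), TLS before non-TLS, RELRO before non-RELRO, and NOBITS
    // (bss-like) sections last within each group.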
return (1 << 10) | (!alloc << 9) | (writable << 8) | (exec << 7) |
(!tls << 6) | (!relro << 5) | (is_bss << 4);
};

  // Ties are broken by the following additional rules:
auto get_rank2 = [&](Chunk<E> *chunk) -> i64 {
ElfShdr<E> &shdr = chunk->shdr;
if (shdr.sh_type == SHT_NOTE)
return -shdr.sh_addralign;
if (chunk == ctx.got)
return 2;
if (chunk->name == ".toc")
return 3;
if (chunk->name == ".alpha_got")
return 4;
if (chunk == ctx.relro_padding)
return INT64_MAX;
return 0;
};
sort(ctx.chunks, [&](Chunk<E> *a, Chunk<E> *b) {
return std::tuple{get_rank1(a), get_rank2(a), a->name} <
std::tuple{get_rank1(b), get_rank2(b), b->name};
});
}

template <typename E>
static std::string_view get_section_order_group(Chunk<E> &chunk) {
if (chunk.shdr.sh_type == SHT_NOBITS)
return "BSS";
if (chunk.shdr.sh_flags & SHF_EXECINSTR)
return "TEXT";
if (chunk.shdr.sh_flags & SHF_WRITE)
return "DATA";
return "RODATA";
}

// Sort sections according to a --section-order argument.
template <typename E>
void sort_output_sections_by_order(Context<E> &ctx) {
auto get_rank = [&](Chunk<E> *chunk) -> i64 {
u64 flags = chunk->shdr.sh_flags;
    if (chunk == ctx.ehdr && !(flags & SHF_ALLOC))
      return -2;
    if (chunk == ctx.phdr && !(flags & SHF_ALLOC))
      return -1;
if (chunk == ctx.shdr)
return INT32_MAX;
if (!(flags & SHF_ALLOC))
return INT32_MAX - 1;
for (i64 i = 0; const SectionOrder &arg : ctx.arg.section_order) {
if (arg.type == SectionOrder::SECTION && arg.name == chunk->name)
return i;
i++;
}
std::string_view group = get_section_order_group(*chunk);
for (i64 i = 0; i < ctx.arg.section_order.size(); i++) {
SectionOrder arg = ctx.arg.section_order[i];
if (arg.type == SectionOrder::GROUP && arg.name == group)
return i;
}
Error(ctx) << "--section-order: missing section specification for "
<< chunk->name;
return 0;
};

  // It is an error if a section's rank cannot be determined from the
  // given --section-order list.
for (Chunk<E> *chunk : ctx.chunks)
chunk->sect_order = get_rank(chunk);
// Sort output sections by --section-order
sort(ctx.chunks, [&](Chunk<E> *a, Chunk<E> *b) {
return a->sect_order < b->sect_order;
});
}

template <typename E>
void sort_output_sections(Context<E> &ctx) {
if (ctx.arg.section_order.empty())
sort_output_sections_regular(ctx);
else
sort_output_sections_by_order(ctx);
}

template <typename E>
static bool is_tbss(Chunk<E> *chunk) {
return (chunk->shdr.sh_type == SHT_NOBITS) && (chunk->shdr.sh_flags & SHF_TLS);
}

// This function assigns virtual addresses to output sections. Assigning
// addresses is a bit tricky because we want to pack sections as tightly
// as possible while not violating the constraints imposed by the hardware
// and the OS kernel. Specifically, we need to satisfy the following
// constraints:
//
// - Memory protection (readable, writable and executable) works at page
// granularity. Therefore, if we want to set different memory attributes
// to two sections, we need to place them into separate pages.
//
// - The ELF spec requires that a section's file offset be congruent to
//   its virtual address modulo the page size. For example, on x86-64
//   (where the page size is 4 KiB, i.e. 0x1000 bytes), a section at
//   virtual address 0x401234 can be at file offset 0x3234 or 0x50234
//   but not at 0x1000.
//
// We need to insert padding between sections if we can't satisfy the
// above constraints without it.
//
// We don't want to waste too much memory or disk space on padding.
// There are a few tricks we can use to minimize it:
//
// - We want to place sections with the same memory attributes as
//   contiguously as possible.
//
// - We can map the same file region to memory more than once. For
//   example, we can write code (with R and X bits) and read-only data
//   (with only the R bit) adjacent in the file and map that region
//   twice: as the last page of the executable segment and as the first
//   page of the read-only data segment. This doesn't save memory, but
//   it saves disk space.
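//
// For example, with 4 KiB pages, if .text ends at file offset 0x2f00
// and .rodata starts right there at file offset 0x2f00, the file page
// covering offsets 0x2000-0x2fff can be mapped twice: once with R+X
// permissions as the last page of the executable segment, and once
// with R permissions as the first page of the read-only segment.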
template <typename E>
static void set_virtual_addresses_regular(Context<E> &ctx) {
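  // Fold the RELRO attribute into the phdr flags as a pseudo protection
  // bit so that a RELRO/non-RELRO boundary is treated like any other
  // protection change and is page-aligned below.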
constexpr i64 RELRO = 1LL << 32;
auto get_flags = [&](Chunk<E> *chunk) {
i64 flags = to_phdr_flags(ctx, chunk);
if (chunk->is_relro)
return flags | RELRO;
return flags;
};
// Assign virtual addresses
std::vector<Chunk<E> *> &chunks = ctx.chunks;
u64 addr = ctx.arg.image_base;

  // The alignment of TLS chunks is special: in addition to having their
  // virtual addresses aligned, they also have to be aligned when the
  // region starting at tls_begin is copied to a new thread's storage
  // area. In other words, their offsets from tls_begin also have to be
  // aligned.
  //
  // A good way to achieve this is to take the largest alignment
  // requirement of all TLS sections and make tls_begin aligned to that
  // as well.
Chunk<E> *first_tls_chunk = nullptr;
u64 tls_alignment = 1;
for (Chunk<E> *chunk : chunks) {
if (chunk->shdr.sh_flags & SHF_TLS) {
if (!first_tls_chunk)
first_tls_chunk = chunk;
tls_alignment = std::max(tls_alignment, (u64)chunk->shdr.sh_addralign);
}
}
auto alignment = [&](Chunk<E> *chunk) {
return chunk == first_tls_chunk ? tls_alignment : (u64)chunk->shdr.sh_addralign;
};
for (i64 i = 0; i < chunks.size(); i++) {
if (!(chunks[i]->shdr.sh_flags & SHF_ALLOC))
continue;

    // .relro_padding is a padding section that extends a PT_GNU_RELRO
    // segment to cover an entire page. Technically, we don't need a
    // .relro_padding section, because we could leave the trailing part
    // of the segment as unused space. However, the `strip` command
    // would delete such an unused trailing part, making the executable
    // invalid. So we add a dummy section instead.
if (chunks[i] == ctx.relro_padding) {
chunks[i]->shdr.sh_addr = addr;
chunks[i]->shdr.sh_size = align_to(addr, ctx.page_size) - addr;
addr += ctx.page_size;
continue;
}
// Handle --section-start first
if (auto it = ctx.arg.section_start.find(chunks[i]->name);
it != ctx.arg.section_start.end()) {
addr = it->second;
chunks[i]->shdr.sh_addr = addr;
addr += chunks[i]->shdr.sh_size;
continue;
}

    // Memory protection works at page granularity. We need to put
    // sections with different memory attributes into different pages.
    // We do that by inserting padding here.
if (i > 0 && chunks[i - 1] != ctx.relro_padding) {
i64 flags1 = get_flags(chunks[i - 1]);
i64 flags2 = get_flags(chunks[i]);
if (!ctx.arg.nmagic && flags1 != flags2) {
switch (ctx.arg.z_separate_code) {
case SEPARATE_LOADABLE_SEGMENTS:
addr = align_to(addr, ctx.page_size);
break;
case SEPARATE_CODE:
if ((flags1 & PF_X) != (flags2 & PF_X)) {
addr = align_to(addr, ctx.page_size);
break;
}
[[fallthrough]];
case NOSEPARATE_CODE:
if (addr % ctx.page_size != 0)
addr += ctx.page_size;
break;
default:
unreachable();
}
}
}

    // TLS BSS sections are laid out so that they overlap with the
    // subsequent non-tbss sections. Overlapping is fine because a
    // PT_TLS segment contains only an initialization image for
    // newly-created threads, and no one except the runtime reads its
    // contents. Even the runtime doesn't need the BSS part of the TLS
    // initialization image; it just leaves zero-initialized bytes as-is
    // instead of copying zeros. So no one really reads tbss at runtime.
    //
    // We could instead allocate dedicated virtual address space to
    // tbss, but that would be just a waste of address and disk space.
if (is_tbss(chunks[i])) {
u64 addr2 = addr;
for (;;) {
addr2 = align_to(addr2, alignment(chunks[i]));
chunks[i]->shdr.sh_addr = addr2;
addr2 += chunks[i]->shdr.sh_size;
if (i + 2 == chunks.size() || !is_tbss(chunks[i + 1]))
break;
i++;
}
continue;
}
addr = align_to(addr, alignment(chunks[i]));
chunks[i]->shdr.sh_addr = addr;
addr += chunks[i]->shdr.sh_size;
}
}

template <typename E>
static void set_virtual_addresses_by_order(Context<E> &ctx) {
std::vector<Chunk<E> *> &c = ctx.chunks;
u64 addr = ctx.arg.image_base;
i64 i = 0;
while (i < c.size() && !(c[i]->shdr.sh_flags & SHF_ALLOC))
i++;
auto assign_addr = [&] {
if (i != 0) {
i64 flags1 = to_phdr_flags(ctx, c[i - 1]);
i64 flags2 = to_phdr_flags(ctx, c[i]);

      // Memory protection works at page granularity. We need to put
      // sections with different memory attributes into different
      // pages. We do that by inserting padding here.
if (flags1 != flags2) {
switch (ctx.arg.z_separate_code) {
case SEPARATE_LOADABLE_SEGMENTS:
addr = align_to(addr, ctx.page_size);
break;
case SEPARATE_CODE:
if ((flags1 & PF_X) != (flags2 & PF_X))
addr = align_to(addr, ctx.page_size);
break;
default:
break;
}
}
}
addr = align_to(addr, c[i]->shdr.sh_addralign);
c[i]->shdr.sh_addr = addr;
addr += c[i]->shdr.sh_size;
do {
i++;
} while (i < c.size() && !(c[i]->shdr.sh_flags & SHF_ALLOC));
};
for (i64 j = 0; j < ctx.arg.section_order.size(); j++) {
SectionOrder &ord = ctx.arg.section_order[j];
switch (ord.type) {
case SectionOrder::SECTION:
if (i < c.size() && j == c[i]->sect_order)
assign_addr();
break;
case SectionOrder::GROUP:
while (i < c.size() && j == c[i]->sect_order)
assign_addr();
break;
case SectionOrder::ADDR:
addr = ord.value;
break;
case SectionOrder::ALIGN:
addr = align_to(addr, ord.value);
break;
case SectionOrder::SYMBOL:
get_symbol(ctx, ord.name)->value = addr;
break;
default:
unreachable();
}
}
}

// Returns the smallest integer N that satisfies N >= val and
// N mod align == skew mod align.
//
// A section's file offset must be congruent to its virtual address
// modulo the page size. We use this function to satisfy that
// requirement.
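//
// For example, align_with_skew(0x3000, 0x1000, 0x401234) returns
// 0x3234, since 0x3234 >= 0x3000 and
// 0x3234 % 0x1000 == 0x401234 % 0x1000 == 0x234.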
static u64 align_with_skew(u64 val, u64 align, u64 skew) {
u64 x = align_down(val, align) + skew % align;
return (val <= x) ? x : x + align;
}

// Assign file offsets to output sections.
template <typename E>
static i64 set_file_offsets(Context<E> &ctx) {
std::vector<Chunk<E> *> &chunks = ctx.chunks;
u64 fileoff = 0;
i64 i = 0;
while (i < chunks.size()) {
Chunk<E> &first = *chunks[i];
if (!(first.shdr.sh_flags & SHF_ALLOC)) {
fileoff = align_to(fileoff, first.shdr.sh_addralign);
first.shdr.sh_offset = fileoff;
fileoff += first.shdr.sh_size;
i++;
continue;
}
if (first.shdr.sh_type == SHT_NOBITS) {
first.shdr.sh_offset = fileoff;
i++;
continue;
}
if (first.shdr.sh_addralign > ctx.page_size)
fileoff = align_to(fileoff, first.shdr.sh_addralign);
else
fileoff = align_with_skew(fileoff, ctx.page_size, first.shdr.sh_addr);
// Assign ALLOC sections contiguous file offsets as long as they
// are contiguous in memory.
for (;;) {
chunks[i]->shdr.sh_offset =
fileoff + chunks[i]->shdr.sh_addr - first.shdr.sh_addr;
i++;
if (i >= chunks.size() ||
!(chunks[i]->shdr.sh_flags & SHF_ALLOC) ||
chunks[i]->shdr.sh_type == SHT_NOBITS)
break;

      // If --section-start is given, addresses may not increase
      // monotonically.
if (chunks[i]->shdr.sh_addr < first.shdr.sh_addr)
break;
i64 gap_size = chunks[i]->shdr.sh_addr - chunks[i - 1]->shdr.sh_addr -
chunks[i - 1]->shdr.sh_size;

      // If --section-start is given, there may be a large gap between
      // sections. We don't want to allocate disk space for such a gap.
if (gap_size >= ctx.page_size)
break;
}
fileoff = chunks[i - 1]->shdr.sh_offset + chunks[i - 1]->shdr.sh_size;
while (i < chunks.size() &&
(chunks[i]->shdr.sh_flags & SHF_ALLOC) &&
chunks[i]->shdr.sh_type == SHT_NOBITS) {
chunks[i]->shdr.sh_offset = fileoff;
i++;
}
}
return fileoff;
}

template <typename E>
void compute_section_headers(Context<E> &ctx) {
// Update sh_size for each chunk.
for (Chunk<E> *chunk : ctx.chunks)
chunk->update_shdr(ctx);
// Remove empty chunks.
std::erase_if(ctx.chunks, [&](Chunk<E> *chunk) {
return !chunk->to_osec() && chunk != ctx.gdb_index &&
chunk->shdr.sh_size == 0;
});
// Set section indices.
i64 shndx = 1;
for (i64 i = 0; i < ctx.chunks.size(); i++)
if (!ctx.chunks[i]->is_header())
ctx.chunks[i]->shndx = shndx++;
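
  // If we have SHN_LORESERVE (0xff00) or more sections, a symbol's
  // st_shndx field is too small to hold a section index directly. In
  // that case, an SHT_SYMTAB_SHNDX section provides a parallel array
  // of 32-bit section indices, one per .symtab entry.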
if (ctx.symtab && SHN_LORESERVE <= shndx) {
SymtabShndxSection<E> *sec = new SymtabShndxSection<E>;
sec->shndx = shndx++;
sec->shdr.sh_link = ctx.symtab->shndx;
ctx.symtab_shndx = sec;
ctx.chunks.push_back(sec);
ctx.chunk_pool.emplace_back(sec);
}
if (ctx.shdr)
ctx.shdr->shdr.sh_size = shndx * sizeof(ElfShdr<E>);

  // Some types of section headers refer to other sections by index.
  // Recompute the section headers to fill in such fields with correct
  // values.
for (Chunk<E> *chunk : ctx.chunks)
chunk->update_shdr(ctx);
if (ctx.symtab_shndx) {
i64 symtab_size = ctx.symtab->shdr.sh_size / sizeof(ElfSym<E>);
ctx.symtab_shndx->shdr.sh_size = symtab_size * 4;
}
}

// Assign virtual addresses and file offsets to output sections.
template <typename E>
i64 set_osec_offsets(Context<E> &ctx) {
Timer t(ctx, "set_osec_offsets");
for (;;) {
if (ctx.arg.section_order.empty())
set_virtual_addresses_regular(ctx);
else
set_virtual_addresses_by_order(ctx);

    // Assigning new offsets may change the contents and the length of
    // the program header, so repeat until they converge.
i64 fileoff = set_file_offsets(ctx);
if (ctx.phdr) {
i64 sz = ctx.phdr->shdr.sh_size;
ctx.phdr->update_shdr(ctx);
if (sz != ctx.phdr->shdr.sh_size)
continue;
}
return fileoff;
}
}

template <typename E>
static i64 get_num_irelative_relocs(Context<E> &ctx) {
i64 n = std::count_if(ctx.got->got_syms.begin(), ctx.got->got_syms.end(),
[](Symbol<E> *sym) { return sym->is_ifunc(); });
return n + ctx.num_ifunc_dynrels;
}

template <typename E>
static u64 to_paddr(Context<E> &ctx, u64 vaddr) {
for (ElfPhdr<E> &phdr : ctx.phdr->phdrs)
if (phdr.p_type == PT_LOAD)
if (phdr.p_vaddr <= vaddr && vaddr < phdr.p_vaddr + phdr.p_memsz)
return phdr.p_paddr + (vaddr - phdr.p_vaddr);
return 0;
}

template <typename E>
void fix_synthetic_symbols(Context<E> &ctx) {
auto start = [](Symbol<E> *sym, auto &chunk, i64 bias = 0) {
if (sym && chunk) {
sym->set_output_section(chunk);
sym->value = chunk->shdr.sh_addr + bias;
}
};
auto stop = [](Symbol<E> *sym, auto &chunk) {
if (sym && chunk) {
sym->set_output_section(chunk);
sym->value = chunk->shdr.sh_addr + chunk->shdr.sh_size;
}
};
std::vector<Chunk<E> *> sections;
for (Chunk<E> *chunk : ctx.chunks)
if (!chunk->is_header() && (chunk->shdr.sh_flags & SHF_ALLOC))
sections.push_back(chunk);
auto find = [&](std::string name) -> Chunk<E> * {
for (Chunk<E> *chunk : sections)
if (chunk->name == name)
return chunk;
return nullptr;
};
// __bss_start
if (Chunk<E> *chunk = find(".bss"))
start(ctx.__bss_start, chunk);
if (ctx.ehdr && (ctx.ehdr->shdr.sh_flags & SHF_ALLOC)) {
ctx.__ehdr_start->set_output_section(sections[0]);
ctx.__ehdr_start->value = ctx.ehdr->shdr.sh_addr;
ctx.__executable_start->set_output_section(sections[0]);
ctx.__executable_start->value = ctx.ehdr->shdr.sh_addr;
}
if (ctx.__dso_handle) {
ctx.__dso_handle->set_output_section(sections[0]);
ctx.__dso_handle->value = sections[0]->shdr.sh_addr;
}

  // __rel_iplt_start and __rel_iplt_end. These symbols need to be
  // defined in a statically linked, non-relocatable executable because
  // such an executable lacks the .dynamic section, so there is no way
  // to find ifunc relocations other than through these symbols.
  //
  // We don't want to set values for these symbols if we are creating a
  // static PIE due to a glibc bug. A static PIE has a dynamic section.
  // If we set values for these symbols in a static PIE, glibc attempts
  // to run ifunc initializers twice, the second time with wrong
  // function addresses, causing a segmentation fault.
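  //
  // At startup, libc applies these relocations with a loop along the
  // following lines (a simplified sketch, not mold code; glibc's
  // actual implementation differs):
  //
  //   for (ElfW(Rela) *r = __rel_iplt_start; r < __rel_iplt_end; r++)
  //     *(uintptr_t *)r->r_offset = ((uintptr_t (*)(void))r->r_addend)();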
if (ctx.reldyn && ctx.arg.is_static && !ctx.arg.pie) {
stop(ctx.__rel_iplt_start, ctx.reldyn);
stop(ctx.__rel_iplt_end, ctx.reldyn);
ctx.__rel_iplt_start->value -=
get_num_irelative_relocs(ctx) * sizeof(ElfRel<E>);
}
// __{init,fini}_array_{start,end}
for (Chunk<E> *chunk : sections) {
switch (chunk->shdr.sh_type) {
case SHT_INIT_ARRAY:
start(ctx.__init_array_start, chunk);
stop(ctx.__init_array_end, chunk);
break;
case SHT_PREINIT_ARRAY:
start(ctx.__preinit_array_start, chunk);
stop(ctx.__preinit_array_end, chunk);
break;
case SHT_FINI_ARRAY:
start(ctx.__fini_array_start, chunk);
stop(ctx.__fini_array_end, chunk);
break;
}
}
// _end, _etext, _edata and the like
for (Chunk<E> *chunk : sections) {
if (chunk->shdr.sh_flags & SHF_ALLOC) {
stop(ctx._end, chunk);
stop(ctx.end, chunk);
}
if (chunk->shdr.sh_flags & SHF_EXECINSTR) {
stop(ctx._etext, chunk);
stop(ctx.etext, chunk);
}
if (chunk->shdr.sh_type != SHT_NOBITS &&
(chunk->shdr.sh_flags & SHF_ALLOC)) {
stop(ctx._edata, chunk);
stop(ctx.edata, chunk);
}
}
// _DYNAMIC
start(ctx._DYNAMIC, ctx.dynamic);

  // _GLOBAL_OFFSET_TABLE_. I don't know why, but for the sake of
  // compatibility with existing code, on i386 and x86-64 it must be
  // set to the beginning of .got.plt instead of .got.
if constexpr (is_x86<E>)
start(ctx._GLOBAL_OFFSET_TABLE_, ctx.gotplt);
else
start(ctx._GLOBAL_OFFSET_TABLE_, ctx.got);
// _PROCEDURE_LINKAGE_TABLE_. We need this on SPARC.
start(ctx._PROCEDURE_LINKAGE_TABLE_, ctx.plt);

  // _TLS_MODULE_BASE_. This symbol is used to obtain the address of
  // the TLS block in the TLSDESC model. I believe GCC and Clang don't
  // create a reference to it, but the Intel compiler seems to use it.
if (ctx._TLS_MODULE_BASE_) {
ctx._TLS_MODULE_BASE_->set_output_section(sections[0]);
ctx._TLS_MODULE_BASE_->value = ctx.dtp_addr;
}
// __GNU_EH_FRAME_HDR
start(ctx.__GNU_EH_FRAME_HDR, ctx.eh_frame_hdr);
// RISC-V's __global_pointer$
if (ctx.__global_pointer) {
if (Chunk<E> *chunk = find(".sdata")) {
start(ctx.__global_pointer, chunk, 0x800);
} else {
ctx.__global_pointer->set_output_section(sections[0]);
ctx.__global_pointer->value = 0;
}
}
// ARM32's __exidx_{start,end}
if (ctx.__exidx_start) {
if (Chunk<E> *chunk = find(".ARM.exidx")) {
start(ctx.__exidx_start, chunk);
stop(ctx.__exidx_end, chunk);
}
}
// PPC64's ".TOC." symbol.
if constexpr (is_ppc64<E>) {
if (Chunk<E> *chunk = find(".got")) {
start(ctx.extra.TOC, chunk, 0x8000);
} else if (Chunk<E> *chunk = find(".toc")) {
start(ctx.extra.TOC, chunk, 0x8000);
} else {
ctx.extra.TOC->set_output_section(sections[0]);
ctx.extra.TOC->value = 0;
}
}
// PPC64's _{save,rest}gpr{0,1}_{14,15,16,...,31} symbols
if constexpr (is_ppc64v2<E>) {
i64 offset = 0;
for (auto [label, insn] : ppc64_save_restore_insns) {
if (!label.empty())
if (Symbol<E> *sym = get_symbol(ctx, label);
sym->file == ctx.internal_obj)
start(sym, ctx.extra.save_restore, offset);
offset += 4;
}
}
// __start_ and __stop_ symbols
for (Chunk<E> *chunk : sections) {
if (std::optional<std::string> name = get_start_stop_name(ctx, *chunk)) {
start(get_symbol(ctx, save_string(ctx, "__start_" + *name)), chunk);
stop(get_symbol(ctx, save_string(ctx, "__stop_" + *name)), chunk);
if (ctx.arg.physical_image_base) {
u64 paddr = to_paddr(ctx, chunk->shdr.sh_addr);
Symbol<E> *x = get_symbol(ctx, save_string(ctx, "__phys_start_" + *name));
x->set_output_section(chunk);
x->value = paddr;
Symbol<E> *y = get_symbol(ctx, save_string(ctx, "__phys_stop_" + *name));
y->set_output_section(chunk);
y->value = paddr + chunk->shdr.sh_size;
}
}
}
// --defsym=sym=value symbols
for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) {
Symbol<E> *sym = ctx.arg.defsyms[i].first;
std::variant<Symbol<E> *, u64> val = ctx.arg.defsyms[i].second;
if (u64 *addr = std::get_if<u64>(&val)) {
sym->origin = 0;
sym->value = *addr;
} else {
Symbol<E> *sym2 = std::get<Symbol<E> *>(val);
sym->value = sym2->value;
sym->origin = sym2->origin;
sym->visibility = sym2->visibility.load();
}
}
// --section-order symbols
for (SectionOrder &ord : ctx.arg.section_order)
if (ord.type == SectionOrder::SYMBOL)
get_symbol(ctx, ord.name)->set_output_section(sections[0]);
}

template <typename E>
i64 compress_debug_sections(Context<E> &ctx) {
Timer t(ctx, "compress_debug_sections");
tbb::parallel_for((i64)0, (i64)ctx.chunks.size(), [&](i64 i) {
Chunk<E> &chunk = *ctx.chunks[i];
if ((chunk.shdr.sh_flags & SHF_ALLOC) || chunk.shdr.sh_size == 0 ||
!chunk.name.starts_with(".debug"))
return;
Chunk<E> *comp = new CompressedSection<E>(ctx, chunk);
ctx.chunk_pool.emplace_back(comp);
ctx.chunks[i] = comp;
});
if (ctx.shstrtab)
ctx.shstrtab->update_shdr(ctx);
if (ctx.ehdr)
ctx.ehdr->update_shdr(ctx);
if (ctx.shdr)
ctx.shdr->update_shdr(ctx);
return set_osec_offsets(ctx);
}

// Write Makefile-style dependency rules to a file specified by
// --dependency-file. This is analogous to the compiler's -M flag.
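//
// For example, linking `a.out` from `foo.o` and `bar.o` would produce
// (with hypothetical file names):
//
//   a.out: foo.o bar.o
//
//   foo.o:
//
//   bar.o:
//
// The empty per-input rules keep `make` from failing if an input file
// is later removed.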
template <typename E>
void write_dependency_file(Context<E> &ctx) {
std::vector<std::string> deps;
std::unordered_set<std::string> seen;
for (std::unique_ptr<MappedFile> &mf : ctx.mf_pool)
if (!mf->parent)
if (std::string path = path_clean(mf->name); seen.insert(path).second)
deps.push_back(path);
std::ofstream out;
out.open(ctx.arg.dependency_file);
if (out.fail())
Fatal(ctx) << "--dependency-file: cannot open " << ctx.arg.dependency_file
<< ": " << errno_string();
out << ctx.arg.output << ":";
for (std::string &s : deps)
out << " " << s;
out << "\n";
for (std::string &s : deps)
out << "\n" << s << ":\n";
out.close();
}

template <typename E>
void show_stats(Context<E> &ctx) {
for (ObjectFile<E> *obj : ctx.objs) {
static Counter defined("defined_syms");
defined += obj->first_global - 1;
static Counter undefined("undefined_syms");
undefined += obj->symbols.size() - obj->first_global;
for (std::unique_ptr<InputSection<E>> &sec : obj->sections) {
if (!sec || !sec->is_alive)
continue;
static Counter alloc("reloc_alloc");
static Counter nonalloc("reloc_nonalloc");
if (sec->shdr().sh_flags & SHF_ALLOC)
alloc += sec->get_rels(ctx).size();
else
nonalloc += sec->get_rels(ctx).size();
}
static Counter comdats("comdats");
comdats += obj->comdat_groups.size();
static Counter removed_comdats("removed_comdat_mem");
for (ComdatGroupRef<E> &ref : obj->comdat_groups)
if (ref.group->owner != obj->priority)
removed_comdats += ref.members.size();
static Counter num_cies("num_cies");
num_cies += obj->cies.size();
static Counter num_unique_cies("num_unique_cies");
for (CieRecord<E> &cie : obj->cies)
if (cie.is_leader)
num_unique_cies++;
static Counter num_fdes("num_fdes");
num_fdes += obj->fdes.size();
}
static Counter num_bytes("total_input_bytes");
for (std::unique_ptr<MappedFile> &mf : ctx.mf_pool)
num_bytes += mf->size;
static Counter num_input_sections("input_sections");
for (ObjectFile<E> *file : ctx.objs)
num_input_sections += file->sections.size();
static Counter num_output_chunks("output_chunks", ctx.chunks.size());
static Counter num_objs("num_objs", ctx.objs.size());
static Counter num_dsos("num_dsos", ctx.dsos.size());
if constexpr (needs_thunk<E>) {
static Counter thunk_bytes("thunk_bytes");
for (Chunk<E> *chunk : ctx.chunks)
if (OutputSection<E> *osec = chunk->to_osec())
for (std::unique_ptr<Thunk<E>> &thunk : osec->thunks)
thunk_bytes += thunk->size();
}
Counter::print();
for (std::unique_ptr<MergedSection<E>> &sec : ctx.merged_sections)
sec->print_stats(ctx);
}

using E = MOLD_TARGET;

template int redo_main(Context<E> &, int, char **);
template void create_internal_file(Context<E> &);
template void apply_exclude_libs(Context<E> &);
template void create_synthetic_sections(Context<E> &);
template void resolve_symbols(Context<E> &);
template void kill_eh_frame_sections(Context<E> &);
template void resolve_section_pieces(Context<E> &);
template void convert_common_symbols(Context<E> &);
template void compute_merged_section_sizes(Context<E> &);
template void create_output_sections(Context<E> &);
template void add_synthetic_symbols(Context<E> &);
template void check_cet_errors(Context<E> &);
template void print_dependencies(Context<E> &);
template void write_repro_file(Context<E> &);
template void check_duplicate_symbols(Context<E> &);
template void check_symbol_types(Context<E> &);
template void sort_init_fini(Context<E> &);
template void sort_ctor_dtor(Context<E> &);
template void fixup_ctors_in_init_array(Context<E> &);
template void shuffle_sections(Context<E> &);
template void compute_section_sizes(Context<E> &);
template void sort_output_sections(Context<E> &);
template void claim_unresolved_symbols(Context<E> &);
template void compute_imported_symbol_weakness(Context<E> &);
template void scan_relocations(Context<E> &);
template void report_undef_errors(Context<E> &);
template void create_reloc_sections(Context<E> &);
template void copy_chunks(Context<E> &);
template void rewrite_endbr(Context<E> &);
template void construct_relr(Context<E> &);
template void create_output_symtab(Context<E> &);
template void apply_version_script(Context<E> &);
template void parse_symbol_version(Context<E> &);
template void compute_import_export(Context<E> &);
template void compute_address_significance(Context<E> &);
template void clear_padding(Context<E> &);
template void compute_section_headers(Context<E> &);
template i64 set_osec_offsets(Context<E> &);
template void fix_synthetic_symbols(Context<E> &);
template i64 compress_debug_sections(Context<E> &);
template void write_dependency_file(Context<E> &);
template void show_stats(Context<E> &);

} // namespace mold::elf