Merge 9ab4ab14ab into 18da5b654e

Add --no-detach to write to a separate debug file in the foreground
--detach is the default.
2024-08-16 00:10:55 +03:00 · 2024-07-09 05:47:42 +01:00 · 2024-07-09 12:06:41 +09:00 · 2024-07-09 10:15:26 +09:00 · 2024-07-08 10:43:40 +09:00 · 2024-07-08 09:59:07 +09:00
14 changed files with 465 additions and 136 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -364,6 +364,7 @@ endforeach()
 # Add other non-template source files.
 target_sources(mold PRIVATE
  common/compress.cc
+  common/crc32.cc
  common/demangle.cc
  common/filepath.cc
  common/glob.cc
--- a/README.md
+++ b/README.md
@ -155,7 +155,7 @@ example, `gcc` is used as the linker driver. Use the `-fuse-ld` option if your
 GCC is recent enough to recognize this option.

 If you want to use mold for all projects, add the above snippet to
-`~/.config/config.nims`.
+`~/.config/nim/config.nims`.

 </details>

--- a/common/common.h
+++ b/common/common.h
@ -2,6 +2,7 @@

 #include "integers.h"

+#include <array>
 #include <atomic>
 #include <bit>
 #include <bitset>
@ -899,6 +900,13 @@ std::optional<std::string_view> demangle_rust(std::string_view name);
 void acquire_global_lock();
 void release_global_lock();

+//
+// crc32.cc
+//
+
+u32 compute_crc32(u32 crc, u8 *buf, i64 len);
+std::vector<u8> crc32_solve(u32 current, u32 desired);
+
 //
 // compress.cc
 //
--- a/common/crc32.cc
+++ b/common/crc32.cc
@ -0,0 +1,60 @@
+#include "common.h"
+
+#include <tbb/parallel_for_each.h>
+#include <zlib.h>
+
+namespace mold {
+
+// This function "forges" a CRC. That is, given the current and a desired
+// CRC32 value, crc32_solve() returns a 4-byte blob to append to the end
+// of the original data to yield the desired CRC. Trailing garbage is
+// ignored by many binary file formats, so you can create a file with a
+// desired CRC using crc32_solve(). We need it for --separate-debug-file.
+std::vector<u8> crc32_solve(u32 current, u32 desired) {
+  constexpr u32 poly = 0xedb88320;
+  u32 x = ~desired;
+
+  // Each iteration rewinds the CRC by one bit: x = (x * X^-1) mod poly.
+  for (i64 i = 0; i < 32; i++) {
+    x = std::rotl(x, 1);
+    x ^= (x & 1) * (poly << 1);
+  }
+
+  x ^= ~current;
+
+  std::vector<u8> out(4);
+  out[0] = x;
+  out[1] = x >> 8;
+  out[2] = x >> 16;
+  out[3] = x >> 24;
+  return out;
+}
+
+// Compute a CRC32 of buf[0, len) in parallel, continuing from `crc`.
+u32 compute_crc32(u32 crc, u8 *buf, i64 len) {
+  struct Shard {
+    u8 *buf;
+    i64 len;
+    u32 crc;
+  };
+
+  constexpr i64 shard_size = 1024 * 1024; // 1 MiB
+  std::vector<Shard> shards;
+
+  while (len > 0) {
+    i64 sz = std::min(len, shard_size);
+    shards.push_back({buf, sz, 0});
+    buf += sz;
+    len -= sz;
+  }
+
+  tbb::parallel_for_each(shards.begin(), shards.end(), [](Shard &shard) {
+    shard.crc = crc32_z(0, shard.buf, shard.len);
+  });
+
+  for (Shard &shard : shards)
+    crc = crc32_combine(crc, shard.crc, shard.len);
+  return crc;
+}
+
+} // namespace mold
--- a/elf/cmdline.cc
+++ b/elf/cmdline.cc
@ -85,6 +85,8 @@ Options:
  --defsym=SYMBOL=VALUE       Define a symbol alias
  --demangle                  Demangle C++ symbols in log messages (default)
    --no-demangle
+  --detach                    Create separate debug info file in the background (default)
+    --no-detach
  --enable-new-dtags          Emit DT_RUNPATH for --rpath (default)
    --disable-new-dtags       Emit DT_RPATH for --rpath
  --execute-only              Make executable segments unreadable
@ -143,6 +145,8 @@ Options:
  --rpath-link DIR            Ignored
  --run COMMAND ARG...        Run COMMAND with mold as /usr/bin/ld
  --section-start=SECTION=ADDR Set address for section
+  --separate-debug-file[=FILE] Separate debug info to the specified file
+    --no-separate-debug-file
  --shared, --Bshareable      Create a shared library
  --shuffle-sections[=SEED]   Randomize the output by shuffling input sections
  --sort-common               Ignored
@ -526,6 +530,7 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
  std::optional<SeparateCodeKind> z_separate_code;
  std::optional<bool> report_undefined;
  std::optional<bool> z_relro;
+  std::optional<std::string> separate_debug_file;
  std::optional<u64> shuffle_sections_seed;
  std::unordered_set<std::string_view> rpaths;

@ -756,6 +761,10 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
      ctx.arg.demangle = true;
    } else if (read_flag("no-demangle")) {
      ctx.arg.demangle = false;
+    } else if (read_flag("detach")) {
+      ctx.arg.detach = true;
+    } else if (read_flag("no-detach")) {
+      ctx.arg.detach = false;
    } else if (read_flag("default-symver")) {
      ctx.arg.default_symver = true;
    } else if (read_flag("noinhibit-exec")) {
@ -1003,6 +1012,12 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
      ctx.arg.z_origin = true;
    } else if (read_z_flag("nodefaultlib")) {
      ctx.arg.z_nodefaultlib = true;
+    } else if (read_eq("separate-debug-file")) {
+      separate_debug_file = arg;
+    } else if (read_flag("separate-debug-file")) {
+      separate_debug_file = "";
+    } else if (read_flag("no-separate-debug-file")) {
+      separate_debug_file.reset();
    } else if (read_z_flag("separate-loadable-segments")) {
      z_separate_code = SEPARATE_LOADABLE_SEGMENTS;
    } else if (read_z_flag("separate-code")) {
@ -1394,9 +1409,20 @@ std::vector<std::string> parse_nonpositional_args(Context<E> &ctx) {
    ctx.default_version = VER_NDX_LAST_RESERVED + 1;
  }

+  if (separate_debug_file) {
+    if (separate_debug_file->empty())
+      ctx.arg.separate_debug_file = ctx.arg.output + ".dbg";
+    else
+      ctx.arg.separate_debug_file = *separate_debug_file;
+  }
+
  if (ctx.arg.shared && warn_shared_textrel)
    ctx.arg.warn_textrel = true;

+  // We don't want the background process to write to stdout
+  if (ctx.arg.stats || ctx.arg.perf)
+    ctx.arg.detach = false;
+
  ctx.arg.undefined.push_back(ctx.arg.entry);

  for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) {
--- a/elf/main.cc
+++ b/elf/main.cc
@ -559,14 +559,17 @@ int elf_main(int argc, char **argv) {
  // Compute the is_weak bit for each imported symbol.
  compute_imported_symbol_weakness(ctx);

-  // Compute sizes of output sections while assigning offsets
-  // within an output section to input sections.
-  compute_section_sizes(ctx);
-
  // Sort sections by section attributes so that we'll have to
  // create as few segments as possible.
  sort_output_sections(ctx);

+  if (!ctx.arg.separate_debug_file.empty())
+    separate_debug_sections(ctx);
+
+  // Compute sizes of output sections while assigning offsets
+  // within an output section to input sections.
+  compute_section_sizes(ctx);
+
  // If --packed_dyn_relocs=relr was given, base relocations are stored
  // to a .relr.dyn section in a compressed form. Construct a compressed
  // relocations now so that we can fix section sizes and file layout.
@ -653,17 +656,18 @@ int elf_main(int argc, char **argv) {
  // .note.gnu.build-id section contains a cryptographic hash of the
  // entire output file. Now that we wrote everything except build-id,
  // we can compute it.
-  if (ctx.buildid) {
-    compute_build_id(ctx);
-    ctx.buildid->copy_buf(ctx);
-  }
+  if (ctx.buildid)
+    write_build_id(ctx);

  // .gdb_index's contents cannot be constructed before applying
  // relocations to other debug sections. We have relocated debug
  // sections now, so write the .gdb_index section.
-  if (ctx.gdb_index)
+  if (ctx.gdb_index && ctx.arg.separate_debug_file.empty())
    write_gdb_index(ctx);

+  if (!ctx.arg.separate_debug_file.empty())
+    write_gnu_debuglink(ctx);
+
  t_copy.stop();
  ctx.checkpoint();

@ -682,6 +686,9 @@ int elf_main(int argc, char **argv) {
  if (ctx.arg.print_map)
    print_map(ctx);

+  if (!ctx.arg.separate_debug_file.empty())
+    write_separate_debug_file(ctx);
+
  // Show stats numbers
  if (ctx.arg.stats)
    show_stats(ctx);
@ -692,9 +699,7 @@ int elf_main(int argc, char **argv) {
  std::cout << std::flush;
  std::cerr << std::flush;

-  if (ctx.arg.fork)
-    notify_parent();
-
+  notify_parent();
  release_global_lock();

  if (ctx.arg.quick_exit)
--- a/elf/mold.h
+++ b/elf/mold.h
@ -373,7 +373,7 @@ public:
  virtual ~Chunk() = default;
  virtual bool is_header() { return false; }
  virtual OutputSection<E> *to_osec() { return nullptr; }
-  virtual MergedSection<E> *to_merged_section() { return nullptr; }
+  virtual void compute_section_size(Context<E> &ctx) {}
  virtual i64 get_reldyn_size(Context<E> &ctx) const { return 0; }
  virtual void construct_relr(Context<E> &ctx) {}
  virtual void copy_buf(Context<E> &ctx) {}
@ -480,6 +480,7 @@ public:
  }

  OutputSection<E> *to_osec() override { return this; }
+  void compute_section_size(Context<E> &ctx) override;
  void construct_relr(Context<E> &ctx) override;
  void copy_buf(Context<E> &ctx) override;
  void write_to(Context<E> &ctx, u8 *buf) override;
@ -806,9 +807,8 @@ public:
  SectionFragment<E> *insert(Context<E> &ctx, std::string_view data,
                             u64 hash, i64 p2align);

-  MergedSection<E> *to_merged_section() override { return this; }
  void resolve(Context<E> &ctx);
-  void assign_offsets(Context<E> &ctx);
+  void compute_section_size(Context<E> &ctx) override;
  void copy_buf(Context<E> &ctx) override;
  void write_to(Context<E> &ctx, u8 *buf) override;
  void print_stats(Context<E> &ctx);
@ -993,6 +993,22 @@ private:
  std::map<u32, u32> properties;
 };

+template <typename E>
+class GnuDebuglinkSection : public Chunk<E> {
+public:
+  GnuDebuglinkSection() {
+    this->name = ".gnu_debuglink";
+    this->shdr.sh_type = SHT_PROGBITS;
+    this->shdr.sh_addralign = 4;
+  }
+
+  void update_shdr(Context<E> &ctx) override;
+  void copy_buf(Context<E> &ctx) override;
+
+  std::string filename;
+  u32 crc32 = 0;
+};
+
 template <typename E>
 class GdbIndexSection : public Chunk<E> {
 public:
@ -1439,11 +1455,14 @@ template <typename E> void apply_version_script(Context<E> &);
 template <typename E> void parse_symbol_version(Context<E> &);
 template <typename E> void compute_import_export(Context<E> &);
 template <typename E> void compute_address_significance(Context<E> &);
+template <typename E> void separate_debug_sections(Context<E> &);
 template <typename E> void compute_section_headers(Context<E> &);
 template <typename E> i64 set_osec_offsets(Context<E> &);
 template <typename E> void fix_synthetic_symbols(Context<E> &);
 template <typename E> i64 compress_debug_sections(Context<E> &);
-template <typename E> void compute_build_id(Context<E> &);
+template <typename E> void write_build_id(Context<E> &);
+template <typename E> void write_gnu_debuglink(Context<E> &);
+template <typename E> void write_separate_debug_file(Context<E> &ctx);
 template <typename E> void write_dependency_file(Context<E> &);
 template <typename E> void show_stats(Context<E> &);

@ -1721,6 +1740,7 @@ struct Context {
    bool color_diagnostics = false;
    bool default_symver = false;
    bool demangle = true;
+    bool detach = true;
    bool discard_all = false;
    bool discard_locals = false;
    bool eh_frame_hdr = true;
@ -1807,6 +1827,7 @@ struct Context {
    std::string package_metadata;
    std::string plugin;
    std::string rpaths;
+    std::string separate_debug_file;
    std::string soname;
    std::string sysroot;
    std::unique_ptr<std::unordered_set<std::string_view>> retain_symbols_file;
@ -1885,6 +1906,9 @@ struct Context {

  tbb::concurrent_hash_map<Symbol<E> *, std::vector<std::string>> undef_errors;

+  // For --separate-debug-file
+  std::vector<Chunk<E> *> debug_chunks;
+
  // Output chunks
  OutputEhdr<E> *ehdr = nullptr;
  OutputShdr<E> *shdr = nullptr;
@ -1900,6 +1924,7 @@ struct Context {
  DynstrSection<E> *dynstr = nullptr;
  HashSection<E> *hash = nullptr;
  GnuHashSection<E> *gnu_hash = nullptr;
+  GnuDebuglinkSection<E> *gnu_debuglink = nullptr;
  ShstrtabSection<E> *shstrtab = nullptr;
  PltSection<E> *plt = nullptr;
  PltGotSection<E> *pltgot = nullptr;
--- a/elf/output-chunks.cc
+++ b/elf/output-chunks.cc
@ -865,6 +865,84 @@ void DynamicSection<E>::copy_buf(Context<E> &ctx) {
  write_vector(ctx.buf + this->shdr.sh_offset, contents);
 }

+template <typename T>
+static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
+  std::span<T> span(input);
+  std::vector<std::span<T>> vec;
+
+  while (span.size() >= unit) {
+    vec.push_back(span.subspan(0, unit));
+    span = span.subspan(unit);
+  }
+  if (!span.empty())
+    vec.push_back(span);
+  return vec;
+}
+
+
+// Compute the section size and assign offsets to its input sections
+template <typename E>
+void OutputSection<E>::compute_section_size(Context<E> &ctx) {
+  ElfShdr<E> &shdr = this->shdr;
+
+  // On most RISC systems, we need to create so-called "range extension
+  // thunks" to extend the reach of branch instructions, as their jump
+  // range is limited. create_range_extension_thunks() computes the size
+  // of the section while inserting thunks, so we are done after the call.
+  if constexpr (needs_thunk<E>) {
+    if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) {
+      create_range_extension_thunks(ctx);
+      return;
+    }
+  }
+
+  // Since one output section may contain millions of input sections,
+  // we first split input sections into groups and assign offsets to
+  // groups.
+  struct Group {
+    std::span<InputSection<E> *> members;
+    i64 size = 0;
+    i64 p2align = 0;
+    i64 offset = 0;
+  };
+
+  std::span<InputSection<E> *> mem = members;
+  std::vector<Group> groups;
+  constexpr i64 group_size = 10000;
+
+  while (!mem.empty()) {
+    i64 sz = std::min<i64>(group_size, mem.size());
+    groups.push_back({mem.subspan(0, sz)});
+    mem = mem.subspan(sz);
+  }
+
+  tbb::parallel_for_each(groups, [](Group &group) {
+    for (InputSection<E> *isec : group.members) {
+      group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
+      group.p2align = std::max<i64>(group.p2align, isec->p2align);
+    }
+  });
+
+  shdr.sh_size = 0;
+
+  for (i64 i = 0; i < groups.size(); i++) {
+    shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
+    groups[i].offset = shdr.sh_size;
+    shdr.sh_size += groups[i].size;
+    shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
+  }
+
+  // Assign offsets to input sections within each group.
+  tbb::parallel_for_each(groups, [](Group &group) {
+    i64 offset = group.offset;
+    for (InputSection<E> *isec : group.members) {
+      offset = align_to(offset, 1 << isec->p2align);
+      isec->offset = offset;
+      offset += isec->sh_size;
+    }
+  });
+}
+
 template <typename E>
 void OutputSection<E>::copy_buf(Context<E> &ctx) {
  if (this->shdr.sh_type != SHT_NOBITS)
@ -1621,10 +1699,14 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
  };

  i64 shndx = -1;
+  InputSection<E> *isec = sym.get_input_section();
+
  if (sym.has_copyrel) {
+    // Symbol in .copyrel
    shndx = sym.is_copyrel_readonly ? ctx.copyrel_relro->shndx : ctx.copyrel->shndx;
    esym.st_value = sym.get_addr(ctx);
  } else if (sym.file->is_dso || sym.esym().is_undef()) {
+    // Undefined symbol in a DSO
    esym.st_shndx = SHN_UNDEF;
    esym.st_size = 0;
    if (sym.is_canonical)
@ -1637,7 +1719,7 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
    // Section fragment
    shndx = frag->output_section.shndx;
    esym.st_value = sym.get_addr(ctx);
-  } else if (!sym.get_input_section()) {
+  } else if (!isec) {
    // Absolute symbol
    esym.st_shndx = SHN_ABS;
    esym.st_value = sym.get_addr(ctx);
@ -1651,7 +1733,25 @@ ElfSym<E> to_output_esym(Context<E> &ctx, Symbol<E> &sym, u32 st_name,
    esym.st_type = STT_FUNC;
    esym.st_visibility = sym.visibility;
    esym.st_value = sym.get_plt_addr(ctx);
+  } else if (!isec->output_section) {
+    // Symbol in a mergeable non-SHF_ALLOC section, such as .debug_str
+    assert(!(isec->shdr().sh_flags & SHF_ALLOC));
+    assert(isec->shdr().sh_flags & SHF_MERGE);
+    assert(!sym.file->is_dso);
+
+    ObjectFile<E> *file = (ObjectFile<E> *)sym.file;
+    MergeableSection<E> *m =
+      file->mergeable_sections[file->get_shndx(sym.esym())].get();
+
+    SectionFragment<E> *frag;
+    i64 frag_addend;
+    std::tie(frag, frag_addend) = m->get_fragment(sym.esym().st_value);
+
+    shndx = m->parent.shndx;
+    esym.st_visibility = sym.visibility;
+    esym.st_value = frag->get_addr(ctx) + frag_addend;
  } else {
+    // Symbol in a regular section
    shndx = get_st_shndx(sym);
    esym.st_visibility = sym.visibility;
    esym.st_value = sym.get_addr(ctx, NO_PLT);
@ -1987,7 +2087,7 @@ void MergedSection<E>::resolve(Context<E> &ctx) {
 }

 template <typename E>
-void MergedSection<E>::assign_offsets(Context<E> &ctx) {
+void MergedSection<E>::compute_section_size(Context<E> &ctx) {
  if (!resolved)
    resolve(ctx);

@ -2848,6 +2948,20 @@ void ComdatGroupSection<E>::copy_buf(Context<E> &ctx) {
    *buf++ = chunk->shndx;
 }

+template <typename E>
+void GnuDebuglinkSection<E>::update_shdr(Context<E> &ctx) {
+  filename = std::filesystem::path(ctx.arg.separate_debug_file).filename().string();
+  this->shdr.sh_size = align_to(filename.size() + 1, 4) + 4;
+}
+
+template <typename E>
+void GnuDebuglinkSection<E>::copy_buf(Context<E> &ctx) {
+  u8 *buf = ctx.buf + this->shdr.sh_offset;
+  memset(buf, 0, this->shdr.sh_size);
+  write_string(buf, filename);
+  *(U32<E> *)(buf + this->shdr.sh_size - 4) = crc32;
+}
+
 using E = MOLD_TARGET;

 template class Chunk<E>;
@ -2886,6 +3000,7 @@ template class GdbIndexSection<E>;
 template class CompressedSection<E>;
 template class RelocSection<E>;
 template class ComdatGroupSection<E>;
+template class GnuDebuglinkSection<E>;

 template OutputSection<E> *find_section(Context<E> &, u32);
 template OutputSection<E> *find_section(Context<E> &, std::string_view);
--- a/elf/passes.cc
+++ b/elf/passes.cc
@ -156,6 +156,8 @@ void create_synthetic_sections(Context<E> &ctx) {
    ctx.verdef = push(new VerdefSection<E>);
  if (ctx.arg.emit_relocs)
    ctx.eh_frame_reloc = push(new EhFrameRelocSection<E>);
+  if (!ctx.arg.separate_debug_file.empty())
+    ctx.gnu_debuglink = push(new GnuDebuglinkSection<E>);

  if (ctx.arg.shared || !ctx.dsos.empty() || ctx.arg.pie) {
    ctx.dynamic = push(new DynamicSection<E>(ctx));
@ -466,20 +468,6 @@ static std::string get_cmdline_args(Context<E> &ctx) {
  return ss.str();
 }

-template <typename T>
-static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
-  std::span<T> span(input);
-  std::vector<std::span<T>> vec;
-
-  while (span.size() >= unit) {
-    vec.push_back(span.subspan(0, unit));
-    span = span.subspan(unit);
-  }
-  if (!span.empty())
-    vec.push_back(span);
-  return vec;
-}
-
 template <typename E>
 static bool has_ctors_and_init_array(Context<E> &ctx) {
  bool x = false;
@ -1351,84 +1339,24 @@ template <typename E>
 void compute_section_sizes(Context<E> &ctx) {
  Timer t(ctx, "compute_section_sizes");

-  struct Group {
-    i64 size = 0;
-    i64 p2align = 0;
-    i64 offset = 0;
-    std::span<InputSection<E> *> members;
-  };
-
-  // Assign offsets to OutputSection members
-  tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
-    OutputSection<E> *osec = chunk->to_osec();
-    if (!osec)
-      return;
-
-    // This pattern will be processed in the next loop.
-    if constexpr (needs_thunk<E>)
-      if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable)
-        return;
-
-    // Since one output section may contain millions of input sections,
-    // we first split input sections into groups and assign offsets to
-    // groups.
-    std::vector<Group> groups;
-    constexpr i64 group_size = 10000;
-
-    for (std::span<InputSection<E> *> span : split(osec->members, group_size))
-      groups.push_back(Group{.members = span});
-
-    tbb::parallel_for_each(groups, [](Group &group) {
-      for (InputSection<E> *isec : group.members) {
-        group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
-        group.p2align = std::max<i64>(group.p2align, isec->p2align);
-      }
-    });
-
-    ElfShdr<E> &shdr = osec->shdr;
-    shdr.sh_size = 0;
-
-    for (i64 i = 0; i < groups.size(); i++) {
-      shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
-      groups[i].offset = shdr.sh_size;
-      shdr.sh_size += groups[i].size;
-      shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
-    }
-
-    // Assign offsets to input sections.
-    tbb::parallel_for_each(groups, [](Group &group) {
-      i64 offset = group.offset;
-      for (InputSection<E> *isec : group.members) {
-        offset = align_to(offset, 1 << isec->p2align);
-        isec->offset = offset;
-        offset += isec->sh_size;
-      }
-    });
-  });
-
-
-  // Assign offsets to MergedSection members
-  tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
-    if (MergedSection<E> *sec = chunk->to_merged_section())
-      sec->assign_offsets(ctx);
-  });
-
-  // On ARM32 or ARM64, we may need to create so-called "range extension
-  // thunks" to extend branch instructions reach, as they can jump only
-  // to ±16 MiB or ±128 MiB, respecitvely.
-  //
-  // In the following loop, We compute the sizes of sections while
-  // inserting thunks. This pass cannot be parallelized. That is,
-  // create_range_extension_thunks is parallelized internally, but the
-  // function itself is not thread-safe.
  if constexpr (needs_thunk<E>) {
-    Timer t(ctx, "create_range_extension_thunks");
+    // Chunk<E>::compute_section_size may obtain a global lock to create
+    // range extension thunks. I don't know why, but using parallel_for
+    // loop both inside and outside of the lock may cause a deadlock. It
+    // might be a bug in TBB. For now, I'll avoid using parallel_for_each
+    // here.
+    for (Chunk<E> *chunk : ctx.chunks)
+      if (chunk->shdr.sh_flags & SHF_EXECINSTR)
+        chunk->compute_section_size(ctx);

-    if (!ctx.arg.relocatable)
-      for (Chunk<E> *chunk : ctx.chunks)
-        if (OutputSection<E> *osec = chunk->to_osec())
-          if (osec->shdr.sh_flags & SHF_EXECINSTR)
-            osec->create_range_extension_thunks(ctx);
+    tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+      if (!(chunk->shdr.sh_flags & SHF_EXECINSTR))
+        chunk->compute_section_size(ctx);
+    });
+  } else {
+    tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
+      chunk->compute_section_size(ctx);
+    });
  }
 }

@ -2676,6 +2604,24 @@ static i64 set_file_offsets(Context<E> &ctx) {
  return fileoff;
 }

+// Move non-SHF_ALLOC ctx.gdb_index/symtab/strtab and .debug_* chunks from
+// ctx.chunks to ctx.debug_chunks. This is for --separate-debug-file.
+template <typename E>
+void separate_debug_sections(Context<E> &ctx) {
+  auto is_debug_section = [&](Chunk<E> *chunk) {
+    if (chunk->shdr.sh_flags & SHF_ALLOC)
+      return false;
+    return chunk == ctx.gdb_index || chunk == ctx.symtab || chunk == ctx.strtab ||
+           chunk->name.starts_with(".debug_");
+  };
+
+  auto mid = std::stable_partition(ctx.chunks.begin(), ctx.chunks.end(),
+                                   is_debug_section);
+
+  ctx.debug_chunks = {ctx.chunks.begin(), mid};
+  ctx.chunks.erase(ctx.chunks.begin(), mid);
+}
+
 template <typename E>
 void compute_section_headers(Context<E> &ctx) {
  // Update sh_size for each chunk.
@ -3006,23 +2952,34 @@ static void blake3_hash(u8 *buf, i64 size, u8 *out) {
 }

 template <typename E>
-void compute_build_id(Context<E> &ctx) {
-  Timer t(ctx, "compute_build_id");
+std::vector<std::span<u8>> get_shards(Context<E> &ctx) {
+  constexpr i64 shard_size = 4 * 1024 * 1024; // 4 MiB
+  std::span<u8> buf = {ctx.buf, (size_t)ctx.output_file->filesize};
+  std::vector<std::span<u8>> vec;
+
+  while (!buf.empty()) {
+    i64 sz = std::min<i64>(shard_size, buf.size());
+    vec.push_back(buf.subspan(0, sz));
+    buf = buf.subspan(sz);
+  }
+  return vec;
+}
+
+template <typename E>
+void write_build_id(Context<E> &ctx) {
+  Timer t(ctx, "write_build_id");

  switch (ctx.arg.build_id.kind) {
  case BuildId::HEX:
    ctx.buildid->contents = ctx.arg.build_id.value;
    break;
  case BuildId::HASH: {
-    i64 shard_size = 4 * 1024 * 1024;
-    i64 filesize = ctx.output_file->filesize;
-    i64 num_shards = align_to(filesize, shard_size) / shard_size;
-    std::vector<u8> shards(num_shards * BLAKE3_OUT_LEN);
+    std::vector<std::span<u8>> shards = get_shards(ctx);
+    std::vector<u8> hashes(shards.size() * BLAKE3_OUT_LEN);

-    tbb::parallel_for((i64)0, num_shards, [&](i64 i) {
-      u8 *begin = ctx.buf + shard_size * i;
-      u8 *end = (i == num_shards - 1) ? ctx.buf + filesize : begin + shard_size;
-      blake3_hash(begin, end - begin, shards.data() + i * BLAKE3_OUT_LEN);
+    tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
+      blake3_hash(shards[i].data(), shards[i].size(),
+                  hashes.data() + i * BLAKE3_OUT_LEN);

 #ifdef HAVE_MADVISE
      // Make the kernel page out the file contents we've just written
@ -3033,7 +2990,7 @@ void compute_build_id(Context<E> &ctx) {
    });

    u8 buf[BLAKE3_OUT_LEN];
-    blake3_hash(shards.data(), shards.size(), buf);
+    blake3_hash(hashes.data(), hashes.size(), buf);

    assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN);
    ctx.buildid->contents = {buf, buf + ctx.arg.build_id.size()};
@ -3052,8 +3009,109 @@ void compute_build_id(Context<E> &ctx) {
  default:
    unreachable();
  }
+
+  ctx.buildid->copy_buf(ctx);
 }

+// A .gnu_debuglink section contains a filename and a CRC32 checksum of a
+// debug info file. When we are writing a .gnu_debuglink, we don't know
+// its CRC32 checksum because we haven't created a debug info file. So we
+// write a dummy value instead.
+//
+// We can't choose a random value as a dummy value for the sake of build
+// reproducibility. We also don't want to write a fixed value for all
+// files because the CRC checksum is in this section to prevent a wrong
+// file from being used for debugging. gdb rejects a debug info file if
+// its CRC doesn't match the one in .gnu_debuglink.
+//
+// Therefore, we'll try to make our CRC checksum as unique as possible.
+// We'll remember that checksum, and after creating a debug info file, add
+// a few bytes of garbage at the end of it so that the debug info file's
+// CRC checksum becomes the one that we have precomputed.
+template <typename E>
+void write_gnu_debuglink(Context<E> &ctx) {
+  Timer t(ctx, "write_gnu_debuglink");
+  u32 crc32;
+
+  if (ctx.buildid) {
+    crc32 = compute_crc32(0, ctx.buildid->contents.data(),
+                          ctx.buildid->contents.size());
+  } else {
+    std::vector<std::span<u8>> shards = get_shards(ctx);
+    std::vector<U64<E>> hashes(shards.size());
+
+    tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
+      hashes[i] = hash_string({(char *)shards[i].data(), shards[i].size()});
+    });
+    crc32 = compute_crc32(0, (u8 *)hashes.data(), hashes.size() * 8);
+  }
+
+  ctx.gnu_debuglink->crc32 = crc32;
+  ctx.gnu_debuglink->copy_buf(ctx);
+}
+
+// Write a separate debug file. This function is called after we finish
+// writing to the usual output file.
+template <typename E>
+void write_separate_debug_file(Context<E> &ctx) {
+  Timer t(ctx, "write_separate_debug_file");
+
+  // We want to write to the debug info file in background so that the
+  // user doesn't have to wait for it to complete.
+  if (ctx.arg.detach)
+    notify_parent();
+
+  // A debug info file contains the same sections as the original file,
+  // though most of them can be empty as if they were bss sections. We
+  // convert real sections into dummy SHT_NOBITS sections here.
+  for (i64 i = 0; i < ctx.chunks.size(); i++) {
+    Chunk<E> *chunk = ctx.chunks[i];
+    if (chunk != ctx.ehdr && chunk != ctx.shdr && chunk != ctx.shstrtab &&
+        chunk->shdr.sh_type != SHT_NOTE) {
+      Chunk<E> *sec = new OutputSection<E>(chunk->name, SHT_NULL);
+      sec->shdr = chunk->shdr;
+      sec->shdr.sh_type = SHT_NOBITS;
+
+      ctx.chunks[i] = sec;
+      ctx.chunk_pool.emplace_back(sec);
+    }
+  }
+
+  // Restore debug info sections that had been set aside while we were
+  // creating the main file.
+  tbb::parallel_for_each(ctx.debug_chunks, [&](Chunk<E> *chunk) {
+    chunk->compute_section_size(ctx);
+  });
+
+  append(ctx.chunks, ctx.debug_chunks);
+
+  // Write to the debug info file as if it were a regular output file.
+  compute_section_headers(ctx);
+  i64 filesize = set_osec_offsets(ctx);
+
+  ctx.output_file =
+    OutputFile<Context<E>>::open(ctx, ctx.arg.separate_debug_file,
+                                 filesize, 0666);
+  ctx.buf = ctx.output_file->buf;
+
+  copy_chunks(ctx);
+
+  if (ctx.gdb_index)
+    write_gdb_index(ctx);
+
+  // Reverse-compute a CRC32 value so that the CRC32 checksum embedded in
+  // the .gnu_debuglink section in the main executable matches the debug
+  // info file's CRC32 checksum.
+  u32 crc = compute_crc32(0, ctx.buf, filesize);
+
+  std::vector<u8> &buf2 = ctx.output_file->buf2;
+  if (!buf2.empty())
+    crc = compute_crc32(crc, buf2.data(), buf2.size());
+
+  std::vector<u8> trailer = crc32_solve(crc, ctx.gnu_debuglink->crc32);
+  append(ctx.output_file->buf2, trailer);
+  ctx.output_file->close(ctx);
+}

 // Write Makefile-style dependency rules to a file specified by
 // --dependency-file. This is analogous to the compiler's -M flag.
@ -3188,11 +3246,14 @@ template void apply_version_script(Context<E> &);
 template void parse_symbol_version(Context<E> &);
 template void compute_import_export(Context<E> &);
 template void compute_address_significance(Context<E> &);
+template void separate_debug_sections(Context<E> &);
 template void compute_section_headers(Context<E> &);
 template i64 set_osec_offsets(Context<E> &);
 template void fix_synthetic_symbols(Context<E> &);
 template i64 compress_debug_sections(Context<E> &);
-template void compute_build_id(Context<E> &);
+template void write_build_id(Context<E> &);
+template void write_gnu_debuglink(Context<E> &);
+template void write_separate_debug_file(Context<E> &);
 template void write_dependency_file(Context<E> &);
 template void show_stats(Context<E> &);

--- a/elf/subprocess-unix.cc
+++ b/elf/subprocess-unix.cc
@ -60,6 +60,7 @@ void notify_parent() {
  char buf[] = {1};
  [[maybe_unused]] int n = write(pipe_write_fd, buf, 1);
  assert(n == 1);
+  pipe_write_fd = -1;
 }
 #endif

--- a/elf/thunks.cc
+++ b/elf/thunks.cc
@ -170,6 +170,10 @@ static void scan_rels(Context<E> &ctx, InputSection<E> &isec,

 template <>
 void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
+  // This function is not thread-safe because it mutates symbols' members
+  static std::mutex mu;
+  std::scoped_lock lock(mu);
+
  std::span<InputSection<E> *> m = members;
  if (m.empty())
    return;
@ -247,10 +251,8 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {

    // Scan relocations between B and C to collect symbols that need
    // entries in the new thunk.
-    tbb::parallel_for_each(m.begin() + b, m.begin() + c,
-                           [&](InputSection<E> *isec) {
-      scan_rels(ctx, *isec, *thunk, thunk_idx);
-    });
+    for (i64 i = b; i < c; i++)
+      scan_rels(ctx, *m[i], *thunk, thunk_idx);

    // Now that we know the number of symbols in the thunk, we can compute
    // the thunk's size.
@ -270,16 +272,15 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
    }

    // Scan relocations again to fix symbol offsets in the last thunk.
-    tbb::parallel_for_each(m.begin() + b, m.begin() + c,
-                           [&](InputSection<E> *isec) {
-      std::span<Symbol<E> *> syms = isec->file.symbols;
-      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
-      std::span<ThunkRef> thunk_refs = isec->extra.thunk_refs;
+    for (i64 i = b; i < c; i++) {
+      std::span<Symbol<E> *> syms = m[i]->file.symbols;
+      std::span<const ElfRel<E>> rels = m[i]->get_rels(ctx);
+      std::span<ThunkRef> thunk_refs = m[i]->extra.thunk_refs;

-      for (i64 i = 0; i < rels.size(); i++)
-        if (thunk_refs[i].thunk_idx == thunk_idx)
-          thunk_refs[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx;
-    });
+      for (i64 j = 0; j < rels.size(); j++)
+        if (thunk_refs[j].thunk_idx == thunk_idx)
+          thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx;
+    }

    // Move B forward to point to the begining of the next batch.
    b = c;
--- a/test/elf/separate-debug-file.sh
+++ b/test/elf/separate-debug-file.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+. $(dirname $0)/common.inc
+
+on_qemu && skip
+command -v gdb >& /dev/null || skip
+command -v flock >& /dev/null || skip
+
+cat <<EOF > $t/a.c
+#include <stdio.h>
+int main() {
+  printf("Hello world\n");
+}
+EOF
+
+$CC -c -o $t/a.o $t/a.c -g
+$CC -B. -o $t/exe1 $t/a.o -Wl,--separate-debug-file
+readelf -SW $t/exe1 | grep -Fq .gnu_debuglink
+
+$CC -c -o $t/a.o $t/a.c -g
+$CC -B. -o $t/exe2 $t/a.o -Wl,--separate-debug-file -Wl,--no-build-id
+readelf -SW $t/exe2 | grep -Fq .gnu_debuglink
+
+sleep 1
+
+gdb $t/exe1 -ex 'list main' -ex 'quit' | grep -Fq printf
+gdb $t/exe2 -ex 'list main' -ex 'quit' | grep -Fq printf
--- a/test/elf/x86_64_note.sh
+++ b/test/elf/x86_64_note.sh
@ -37,5 +37,5 @@ grep -Eq '.note.baz\s+NOTE.+000008 00   A  0   0  8' $t/log
 grep -Eq '.note.nonalloc\s+NOTE.+000008 00      0   0  1' $t/log

 readelf --segments $t/exe > $t/log
-grep -Fq '01     .note.baz .note.foo .note.bar' $t/log
+grep -Fq '01     .note.bar .note.baz .note.foo' $t/log
 ! grep -q 'NOTE.*0x0000000000000000 0x0000000000000000' $t/log || false
--- a/test/elf/x86_64_note2.sh
+++ b/test/elf/x86_64_note2.sh
@ -29,4 +29,4 @@ EOF
 ./mold -o $t/exe $t/a.o $t/b.o $t/c.o $t/d.o

 readelf --segments $t/exe > $t/log
-grep -Fq '01     .note.a .note.c .note.b' $t/log
+grep -Fq '01     .note.a .note.b .note.c' $t/log
Author	SHA1	Message	Date
Jake Leahy	6c305541e6	Merge `9ab4ab14ab` into `18da5b654e`	2024-07-09 05:47:42 +01:00
Rui Ueyama	18da5b654e	Add --no-detach to write to a separate debug file in the foreground --detach is the default.	2024-07-09 12:06:41 +09:00
Rui Ueyama	97a1e218c5	Simplify crc32_solve() The code was originally written by Pete Cawley https://gist.github.com/corsix/bdfc8f2f1dc0f28de39f74de9bf4f060	2024-07-09 10:15:26 +09:00
Rui Ueyama	f9e4cb1a7f	Add a missing #include	2024-07-08 10:43:40 +09:00
Rui Ueyama	60760a892a	Attempt to fix CI	2024-07-08 09:59:07 +09:00
Rui Ueyama	596ffa959a	Add --separate-debug-info This option is to separate debug info to a different file. The debug info file's filename is stored to the main output file's .gnu_debuglink section. gdb can read the section contents and followg the link to find debug info in another file. Fixes https://github.com/rui314/mold/issues/1294	2024-07-08 09:28:32 +09:00
Rui Ueyama	cd3b817f13	Make notify_parent idempotent	2024-07-07 22:49:35 +09:00
Rui Ueyama	19de40fed4	Refactor	2024-07-07 22:45:17 +09:00
Rui Ueyama	e78e12b15b	Refactor	2024-07-07 22:24:42 +09:00
Rui Ueyama	5b4377842b	Fix CI	2024-07-07 21:50:53 +09:00
Jake Leahy	9ab4ab14ab	Correct the location of the global config Correct path is listed [here](https://nim-lang.org/docs/nimc.html#compiler-usage-configuration-files)	2023-06-19 17:47:26 +10:00