mirror of https://github.com/rui314/mold.git synced 2024-10-05 17:17:40 +03:00

Compare commits


4 Commits

Author      SHA1        Message                        Date
Rui Ueyama  fa2632cf7f  Make notify_parent idempotent  2024-07-07 14:36:28 +09:00
Rui Ueyama  6ce5a7845b  Refactor                       2024-07-07 14:36:28 +09:00
Rui Ueyama  d4cd52ca49  Refactor                       2024-07-07 14:36:28 +09:00
Rui Ueyama  8e3679e4e5  Refactor                       2024-07-06 13:11:09 +09:00
9 changed files with 300 additions and 266 deletions

View File

@ -677,6 +677,29 @@ void ObjectFile<E>::sort_relocations(Context<E> &ctx) {
}
}
template <typename E>
void ObjectFile<E>::convert_mergeable_sections(Context<E> &ctx) {
// Convert InputSections to MergeableSections
for (i64 i = 0; i < this->sections.size(); i++) {
InputSection<E> *isec = this->sections[i].get();
if (!isec || isec->sh_size == 0 || isec->relsec_idx != -1)
continue;
const ElfShdr<E> &shdr = isec->shdr();
if (!(shdr.sh_flags & SHF_MERGE))
continue;
MergedSection<E> *parent =
MergedSection<E>::get_instance(ctx, isec->name(), shdr);
if (parent) {
this->mergeable_sections[i] =
std::make_unique<MergeableSection<E>>(ctx, *parent, this->sections[i]);
this->sections[i] = nullptr;
}
}
}
// Usually a section is an atomic unit of inclusion or exclusion.
// The linker doesn't care about its contents. However, if a section is a
// mergeable section (a section with the SHF_MERGE bit set), the linker is
@ -713,54 +736,17 @@ void ObjectFile<E>::sort_relocations(Context<E> &ctx) {
// section piece in a section, but it doesn't do so for any other types
// of symbols.
//
// In mold, we attach symbols to section pieces. If a relocation refers
// to a section symbol, and that symbol's section is a mergeable one,
// we create a new dummy symbol for a section piece and redirect the
// relocation to this new symbol. If a non-section symbol refers to a
// section piece, the section piece is attached to the symbol.
// Section garbage collection and Identical Code Folding work on graphs
// where sections or section pieces are vertices and relocations are
// edges. To make it easy to handle them, we rewrite symbols and
// relocations so that each non-absolute symbol always refers to either
// a non-mergeable section or a section piece.
//
// We do that only for SHF_ALLOC sections because GC and ICF work only
// on memory-allocated sections. Non-memory-allocated mergeable sections
// are not handled here for performance reasons.
template <typename E>
void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
// Convert InputSections to MergeableSections
for (i64 i = 0; i < this->sections.size(); i++) {
InputSection<E> *isec = this->sections[i].get();
if (!isec || isec->sh_size == 0 || isec->relsec_idx != -1)
continue;
MergedSection<E> *parent =
MergedSection<E>::get_instance(ctx, isec->name(), isec->shdr());
if (parent) {
this->mergeable_sections[i] =
std::make_unique<MergeableSection<E>>(ctx, *parent, this->sections[i]);
this->sections[i] = nullptr;
}
}
// Split section contents
for (std::unique_ptr<MergeableSection<E>> &sec : mergeable_sections)
if (sec)
sec->split_contents(ctx);
}
template <typename E>
void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
for (std::unique_ptr<MergeableSection<E>> &m : mergeable_sections) {
if (!m)
continue;
m->fragments.reserve(m->frag_offsets.size());
for (i64 i = 0; i < m->frag_offsets.size(); i++) {
SectionFragment<E> *frag =
m->parent.insert(ctx, m->get_contents(i), m->hashes[i], m->p2align);
m->fragments.push_back(frag);
}
// Reclaim memory as we'll never use this vector again
m->hashes.clear();
m->hashes.shrink_to_fit();
}
void ObjectFile<E>::reattach_section_pieces(Context<E> &ctx) {
// Attach section pieces to symbols.
for (i64 i = 1; i < this->elf_syms.size(); i++) {
Symbol<E> &sym = *this->symbols[i];
@ -769,8 +755,9 @@ void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
if (esym.is_abs() || esym.is_common() || esym.is_undef())
continue;
std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[get_shndx(esym)];
if (!m || m->fragments.empty())
i64 shndx = get_shndx(esym);
std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[shndx];
if (!m || !m->parent.resolved)
continue;
SectionFragment<E> *frag;
@ -785,17 +772,16 @@ void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
}
// Compute the size of frag_syms.
std::vector<InputSection<E> *> vec;
for (std::unique_ptr<InputSection<E>> &isec : sections)
if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC))
vec.push_back(isec.get());
i64 nfrag_syms = 0;
for (InputSection<E> *isec : vec)
for (ElfRel<E> &r : isec->get_rels(ctx))
if (const ElfSym<E> &esym = this->elf_syms[r.r_sym];
esym.st_type == STT_SECTION && mergeable_sections[get_shndx(esym)])
nfrag_syms++;
for (std::unique_ptr<InputSection<E>> &isec : sections)
if (isec)
for (ElfRel<E> &r : isec->get_rels(ctx))
if (const ElfSym<E> &esym = this->elf_syms[r.r_sym];
esym.st_type == STT_SECTION)
if (std::unique_ptr<MergeableSection<E>> &m =
mergeable_sections[get_shndx(esym)])
if (m->parent.resolved)
nfrag_syms++;
this->frag_syms.resize(nfrag_syms);
@ -803,34 +789,38 @@ void ObjectFile<E>::resolve_section_pieces(Context<E> &ctx) {
// create a new dummy non-section symbol and redirect the relocation
// to the newly created symbol.
i64 idx = 0;
for (InputSection<E> *isec : vec) {
for (ElfRel<E> &r : isec->get_rels(ctx)) {
const ElfSym<E> &esym = this->elf_syms[r.r_sym];
if (esym.st_type != STT_SECTION)
continue;
for (std::unique_ptr<InputSection<E>> &isec : sections) {
if (isec) {
for (ElfRel<E> &r : isec->get_rels(ctx)) {
const ElfSym<E> &esym = this->elf_syms[r.r_sym];
if (esym.st_type != STT_SECTION)
continue;
std::unique_ptr<MergeableSection<E>> &m = mergeable_sections[get_shndx(esym)];
if (!m)
continue;
std::unique_ptr<MergeableSection<E>> &m =
mergeable_sections[get_shndx(esym)];
i64 r_addend = get_addend(*isec, r);
if (!m || !m->parent.resolved)
continue;
SectionFragment<E> *frag;
i64 in_frag_offset;
std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend);
i64 r_addend = get_addend(*isec, r);
if (!frag)
Fatal(ctx) << *this << ": bad relocation at " << r.r_sym;
SectionFragment<E> *frag;
i64 in_frag_offset;
std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend);
Symbol<E> &sym = this->frag_syms[idx];
sym.file = this;
sym.set_name("<fragment>");
sym.sym_idx = r.r_sym;
sym.visibility = STV_HIDDEN;
sym.set_frag(frag);
sym.value = in_frag_offset - r_addend;
r.r_sym = this->elf_syms.size() + idx;
idx++;
if (!frag)
Fatal(ctx) << *this << ": bad relocation at " << r.r_sym;
Symbol<E> &sym = this->frag_syms[idx];
sym.file = this;
sym.set_name("<fragment>");
sym.sym_idx = r.r_sym;
sym.visibility = STV_HIDDEN;
sym.set_frag(frag);
sym.value = in_frag_offset - r_addend;
r.r_sym = this->elf_syms.size() + idx;
idx++;
}
}
}
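
The comment and loop above redirect relocations that target section symbols in mergeable sections to per-piece symbols. The lookup that makes this workable is an offset-to-piece mapping over the sorted start offsets of the pieces. Below is a minimal, self-contained sketch of that idea only; the names find_piece, Piece, and piece_offsets are illustrative and not mold's actual API.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct Piece {};  // stand-in for SectionFragment<E>

// Map an offset inside a mergeable input section to (piece, offset within
// piece). Assumes offset >= 0, piece_offsets[0] == 0, and piece_offsets
// sorted in ascending order.
std::pair<Piece *, int64_t>
find_piece(const std::vector<uint32_t> &piece_offsets,
           std::vector<Piece *> &pieces, int64_t offset) {
  // upper_bound returns the first piece starting *after* offset, so the
  // element before it is the piece that contains offset.
  auto it = std::upper_bound(piece_offsets.begin(), piece_offsets.end(), offset);
  int64_t idx = it - piece_offsets.begin() - 1;
  return {pieces[idx], offset - piece_offsets[idx]};
}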

View File

@ -557,6 +557,9 @@ MergeableSection<E>::MergeableSection(Context<E> &ctx, MergedSection<E> &parent,
std::unique_ptr<InputSection<E>> &isec)
: parent(parent), section(std::move(isec)), p2align(section->p2align) {
section->uncompress(ctx);
std::scoped_lock lock(parent.mu);
parent.members.push_back(this);
}
static size_t find_null(std::string_view data, i64 pos, i64 entsize) {
@ -630,6 +633,17 @@ void MergeableSection<E>::split_contents(Context<E> &ctx) {
counter += frag_offsets.size();
}
template <typename E>
void MergeableSection<E>::resolve_contents(Context<E> &ctx) {
fragments.reserve(frag_offsets.size());
for (i64 i = 0; i < frag_offsets.size(); i++)
fragments.push_back(parent.insert(ctx, get_contents(i), hashes[i], p2align));
// Reclaim memory as we'll never use this vector again
hashes.clear();
hashes.shrink_to_fit();
}
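
split_contents() and resolve_contents() form a two-phase scheme: first compute piece boundaries and hashes per input section, then insert each piece into the parent's concurrent map so identical byte sequences collapse into one fragment. A minimal single-threaded sketch of that deduplication effect, with a plain std::unordered_map standing in for mold's ConcurrentMap (illustrative only):

#include <cstdint>
#include <string_view>
#include <unordered_map>

struct Fragment { int64_t offset = -1; };

// Identical byte sequences map to the same Fragment, which is what lets
// "foo\0" coming from two different object files occupy one slot in the
// output section.
Fragment *insert_piece(std::unordered_map<std::string_view, Fragment> &map,
                       std::string_view data) {
  return &map.try_emplace(data).first->second;
}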
using E = MOLD_TARGET;
template bool cie_equals(const CieRecord<E> &, const CieRecord<E> &);

View File

@ -427,10 +427,7 @@ int elf_main(int argc, char **argv) {
kill_eh_frame_sections(ctx);
// Split mergeable section contents into section pieces.
split_section_pieces(ctx);
// Resolve mergeable section pieces to merge them.
resolve_section_pieces(ctx);
create_merged_sections(ctx);
// Handle --relocatable. Since the linker's behavior is quite different
// from the normal one when the option is given, the logic is implemented
@ -464,9 +461,6 @@ int elf_main(int argc, char **argv) {
if (ctx.arg.icf)
icf_sections(ctx);
// Compute sizes of sections containing mergeable strings.
compute_merged_section_sizes(ctx);
// Create linker-synthesized sections such as .got or .plt.
create_synthetic_sections(ctx);
@ -659,10 +653,8 @@ int elf_main(int argc, char **argv) {
// .note.gnu.build-id section contains a cryptographic hash of the
// entire output file. Now that we have written everything except the
// build-id, we can compute it.
if (ctx.buildid) {
compute_build_id(ctx);
ctx.buildid->copy_buf(ctx);
}
if (ctx.buildid)
write_build_id(ctx);
// .gdb_index's contents cannot be constructed before applying
// relocations to other debug sections. We have relocated debug

View File

@ -373,6 +373,7 @@ public:
virtual ~Chunk() = default;
virtual bool is_header() { return false; }
virtual OutputSection<E> *to_osec() { return nullptr; }
virtual void compute_section_size(Context<E> &ctx) {}
virtual i64 get_reldyn_size(Context<E> &ctx) const { return 0; }
virtual void construct_relr(Context<E> &ctx) {}
virtual void copy_buf(Context<E> &ctx) {}
@ -479,6 +480,7 @@ public:
}
OutputSection<E> *to_osec() override { return this; }
void compute_section_size(Context<E> &ctx) override;
void construct_relr(Context<E> &ctx) override;
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
@ -805,13 +807,18 @@ public:
SectionFragment<E> *insert(Context<E> &ctx, std::string_view data,
u64 hash, i64 p2align);
void assign_offsets(Context<E> &ctx);
void resolve(Context<E> &ctx);
void compute_section_size(Context<E> &ctx) override;
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
void print_stats(Context<E> &ctx);
std::vector<MergeableSection<E> *> members;
std::mutex mu;
ConcurrentMap<SectionFragment<E>> map;
HyperLogLog estimator;
bool resolved = false;
private:
MergedSection(std::string_view name, i64 flags, i64 type, i64 entsize);
@ -1095,14 +1102,17 @@ public:
std::unique_ptr<InputSection<E>> &isec);
void split_contents(Context<E> &ctx);
void resolve_contents(Context<E> &ctx);
std::pair<SectionFragment<E> *, i64> get_fragment(i64 offset);
std::string_view get_contents(i64 idx);
MergedSection<E> &parent;
std::vector<SectionFragment<E> *> fragments;
private:
std::unique_ptr<InputSection<E>> section;
std::vector<u32> frag_offsets;
std::vector<u32> hashes;
std::vector<SectionFragment<E> *> fragments;
u8 p2align = 0;
};
@ -1195,8 +1205,8 @@ public:
void parse(Context<E> &ctx);
void initialize_symbols(Context<E> &ctx);
void initialize_mergeable_sections(Context<E> &ctx);
void resolve_section_pieces(Context<E> &ctx);
void convert_mergeable_sections(Context<E> &ctx);
void reattach_section_pieces(Context<E> &ctx);
void resolve_symbols(Context<E> &ctx) override;
void mark_live_objects(Context<E> &ctx,
std::function<void(InputFile<E> *)> feeder) override;
@ -1400,10 +1410,8 @@ template <typename E> void create_synthetic_sections(Context<E> &);
template <typename E> void set_file_priority(Context<E> &);
template <typename E> void resolve_symbols(Context<E> &);
template <typename E> void kill_eh_frame_sections(Context<E> &);
template <typename E> void split_section_pieces(Context<E> &);
template <typename E> void resolve_section_pieces(Context<E> &);
template <typename E> void create_merged_sections(Context<E> &);
template <typename E> void convert_common_symbols(Context<E> &);
template <typename E> void compute_merged_section_sizes(Context<E> &);
template <typename E> void create_output_sections(Context<E> &);
template <typename E> void add_synthetic_symbols(Context<E> &);
template <typename E> void apply_section_align(Context<E> &);
@ -1435,7 +1443,7 @@ template <typename E> void compute_section_headers(Context<E> &);
template <typename E> i64 set_osec_offsets(Context<E> &);
template <typename E> void fix_synthetic_symbols(Context<E> &);
template <typename E> i64 compress_debug_sections(Context<E> &);
template <typename E> void compute_build_id(Context<E> &);
template <typename E> void write_build_id(Context<E> &);
template <typename E> void write_dependency_file(Context<E> &);
template <typename E> void show_stats(Context<E> &);
@ -2353,7 +2361,7 @@ InputSection<E>::get_fragment(Context<E> &ctx, const ElfRel<E> &rel) {
assert(!(shdr().sh_flags & SHF_ALLOC));
const ElfSym<E> &esym = file.elf_syms[rel.r_sym];
if (esym.st_type == STT_SECTION)
if (!esym.is_abs() && !esym.is_common() && !esym.is_undef())
if (std::unique_ptr<MergeableSection<E>> &m =
file.mergeable_sections[file.get_shndx(esym)])
return m->get_fragment(esym.st_value + get_addend(*this, rel));

View File

@ -865,6 +865,84 @@ void DynamicSection<E>::copy_buf(Context<E> &ctx) {
write_vector(ctx.buf + this->shdr.sh_offset, contents);
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
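
A usage sketch for the split() helper above, assuming it is visible in the same translation unit: a 25-element vector with unit = 10 yields three spans of sizes 10, 10, and 5, all aliasing the original vector's storage.

#include <cassert>
#include <numeric>
#include <span>
#include <vector>

static void split_example() {
  std::vector<int> v(25);
  std::iota(v.begin(), v.end(), 0);
  std::vector<std::span<int>> parts = split(v, 10);
  assert(parts.size() == 3);
  assert(parts[0].size() == 10 && parts[1].size() == 10 && parts[2].size() == 5);
}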
// Assign offsets to OutputSection members
template <typename E>
void OutputSection<E>::compute_section_size(Context<E> &ctx) {
ElfShdr<E> &shdr = this->shdr;
// On most RISC systems, we need to create so-called "range extension
// thunks" to extend the reach of branch instructions, as their jump
// range is limited. create_range_extension_thunks() computes the size
// of the section while inserting thunks.
if constexpr (needs_thunk<E>) {
if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) {
create_range_extension_thunks(ctx);
return;
}
}
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
struct Group {
std::span<InputSection<E> *> members;
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
};
std::span<InputSection<E> *> mem = members;
std::vector<Group> groups;
constexpr i64 group_size = 10000;
while (!mem.empty()) {
i64 sz = std::min<i64>(group_size, mem.size());
groups.push_back({mem.subspan(0, sz)});
mem = mem.subspan(sz);
}
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
}
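
As a worked example of the two-level offset assignment above (all numbers made up): group sizes and maximum alignments are computed in parallel, group base offsets are then assigned in a short serial pass, and member offsets are filled in from each group's base in parallel again.

#include <cstdint>
#include <cstdio>

static int64_t align_up(int64_t v, int64_t a) { return (v + a - 1) & ~(a - 1); }

int main() {
  // Pretend group 0 holds sections totaling 0x1234 bytes (16-byte alignment)
  // and group 1 holds 0x80 bytes (8-byte alignment).
  int64_t sizes[2]  = {0x1234, 0x80};
  int64_t aligns[2] = {16, 8};
  int64_t sh_size = 0;

  for (int i = 0; i < 2; i++) {
    sh_size = align_up(sh_size, aligns[i]);            // serial pass over groups
    std::printf("group %d base offset: %#llx\n", i, (long long)sh_size);
    sh_size += sizes[i];
  }
  std::printf("sh_size: %#llx\n", (long long)sh_size);  // 0x12b8
}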
template <typename E>
void OutputSection<E>::copy_buf(Context<E> &ctx) {
if (this->shdr.sh_type != SHT_NOBITS)
@ -1971,7 +2049,26 @@ MergedSection<E>::insert(Context<E> &ctx, std::string_view data, u64 hash,
}
template <typename E>
void MergedSection<E>::assign_offsets(Context<E> &ctx) {
void MergedSection<E>::resolve(Context<E> &ctx) {
tbb::parallel_for_each(members, [&](MergeableSection<E> *sec) {
sec->split_contents(ctx);
});
// We aim for a 2/3 occupancy ratio
map.resize(estimator.get_cardinality() * 3 / 2);
tbb::parallel_for_each(members, [&](MergeableSection<E> *sec) {
sec->resolve_contents(ctx);
});
resolved = true;
}
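
The map.resize() call above sizes the fragment hash table from a HyperLogLog cardinality estimate so that the expected load factor lands around 2/3; a quick arithmetic sanity check with a made-up estimate:

#include <cassert>
#include <cstdint>

int main() {
  int64_t estimated_pieces = 1'200'000;          // HyperLogLog estimate (made up)
  int64_t nbuckets = estimated_pieces * 3 / 2;   // 1'800'000 buckets
  assert(nbuckets == 1'800'000);
  // Expected load factor: 1'200'000 / 1'800'000 = 2/3.
}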
template <typename E>
void MergedSection<E>::compute_section_size(Context<E> &ctx) {
if (!resolved)
resolve(ctx);
std::vector<i64> sizes(map.NUM_SHARDS);
Atomic<i64> alignment = 1;

View File

@ -402,25 +402,50 @@ void kill_eh_frame_sections(Context<E> &ctx) {
}
template <typename E>
void split_section_pieces(Context<E> &ctx) {
Timer t(ctx, "split_section_pieces");
void create_merged_sections(Context<E> &ctx) {
Timer t(ctx, "create_merged_sections");
// Convert InputSections to MergeableSections.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->convert_mergeable_sections(ctx);
});
tbb::parallel_for_each(ctx.merged_sections,
[&](std::unique_ptr<MergedSection<E>> &sec) {
if (sec->shdr.sh_flags & SHF_ALLOC)
sec->resolve(ctx);
});
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->initialize_mergeable_sections(ctx);
file->reattach_section_pieces(ctx);
});
}
template <typename E>
void resolve_section_pieces(Context<E> &ctx) {
Timer t(ctx, "resolve_section_pieces");
// Add strings to .comment
if (!ctx.arg.oformat_binary) {
ElfShdr<E> shdr = {};
shdr.sh_type = SHT_PROGBITS;
shdr.sh_flags = SHF_MERGE | SHF_STRINGS;
// We aim for a 2/3 occupancy ratio
for (std::unique_ptr<MergedSection<E>> &sec : ctx.merged_sections)
sec->map.resize(sec->estimator.get_cardinality() * 3 / 2);
MergedSection<E> *sec = MergedSection<E>::get_instance(ctx, ".comment", shdr);
if (!sec->resolved) {
sec->map.resize(4096);
sec->resolved = true;
}
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->resolve_section_pieces(ctx);
});
auto add = [&](std::string str) {
std::string_view buf = save_string(ctx, str);
std::string_view data(buf.data(), buf.size() + 1);
sec->insert(ctx, data, hash_string(data), 0);
};
// Add an identification string to .comment.
add(get_mold_version());
// Embed command line arguments for debugging.
char *env = getenv("MOLD_DEBUG");
if (env && env[0])
add("mold command line: " + get_cmdline_args(ctx));
}
}
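
A note on the add() lambda above: because .comment is an SHF_MERGE | SHF_STRINGS section, each inserted entry is a NUL-terminated byte string deduplicated by content, so the std::string_view handed to insert() deliberately covers size() + 1 bytes to include the terminator. A minimal sketch of that detail (make_comment_entry is an illustrative name, not mold's API):

#include <string>
#include <string_view>

// `stored` must outlive the returned view (mold uses save_string(ctx, ...)
// for that). Reading data()[size()] is well-defined for std::string and
// yields the '\0' terminator, so the view spans size() + 1 bytes.
std::string_view make_comment_entry(const std::string &stored) {
  return std::string_view(stored.data(), stored.size() + 1);
}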
template <typename E>
@ -441,55 +466,6 @@ static std::string get_cmdline_args(Context<E> &ctx) {
return ss.str();
}
template <typename E>
void add_comment_string(Context<E> &ctx, std::string str) {
ElfShdr<E> shdr = {};
shdr.sh_type = SHT_PROGBITS;
shdr.sh_flags = SHF_MERGE | SHF_STRINGS;
shdr.sh_entsize = 1;
shdr.sh_addralign = 1;
MergedSection<E> *sec = MergedSection<E>::get_instance(ctx, ".comment", shdr);
if (sec->map.nbuckets == 0)
sec->map.resize(4096);
std::string_view buf = save_string(ctx, str);
std::string_view data(buf.data(), buf.size() + 1);
sec->insert(ctx, data, hash_string(data), 0);
}
template <typename E>
void compute_merged_section_sizes(Context<E> &ctx) {
Timer t(ctx, "compute_merged_section_sizes");
// Add an identification string to .comment.
if (!ctx.arg.oformat_binary)
add_comment_string(ctx, get_mold_version());
// Embed command line arguments for debugging.
if (char *env = getenv("MOLD_DEBUG"); env && env[0])
add_comment_string(ctx, "mold command line: " + get_cmdline_args(ctx));
tbb::parallel_for_each(ctx.merged_sections,
[&](std::unique_ptr<MergedSection<E>> &sec) {
sec->assign_offsets(ctx);
});
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
template <typename E>
static bool has_ctors_and_init_array(Context<E> &ctx) {
bool x = false;
@ -729,8 +705,7 @@ void create_output_sections(Context<E> &ctx) {
// Add output sections and mergeable sections to ctx.chunks
for (std::unique_ptr<MergedSection<E>> &osec : ctx.merged_sections)
if (osec->shdr.sh_size)
chunks.push_back(osec.get());
chunks.push_back(osec.get());
// Sections are added to the section lists in an arbitrary order
// because they are created in parallel. Sort them to make the
@ -1362,76 +1337,24 @@ template <typename E>
void compute_section_sizes(Context<E> &ctx) {
Timer t(ctx, "compute_section_sizes");
struct Group {
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
std::span<InputSection<E> *> members;
};
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
OutputSection<E> *osec = chunk->to_osec();
if (!osec)
return;
// This pattern will be processed in the next loop.
if constexpr (needs_thunk<E>)
if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable)
return;
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
std::vector<Group> groups;
constexpr i64 group_size = 10000;
for (std::span<InputSection<E> *> span : split(osec->members, group_size))
groups.push_back(Group{.members = span});
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
ElfShdr<E> &shdr = osec->shdr;
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
});
// On ARM32 or ARM64, we may need to create so-called "range extension
// thunks" to extend the reach of branch instructions, as they can jump
// only to ±16 MiB or ±128 MiB, respectively.
//
// In the following loop, we compute the sizes of sections while
// inserting thunks. This pass cannot be fully parallelized:
// create_range_extension_thunks is parallelized internally, but the
// function itself is not thread-safe.
if constexpr (needs_thunk<E>) {
Timer t2(ctx, "create_range_extension_thunks");
// Chunk<E>::compute_section_size may obtain a global lock to create
// range extension thunks. I don't know why, but using a parallel_for
// loop both inside and outside of the lock may cause a deadlock. It
// might be a bug in TBB. For now, I'll avoid using parallel_for_each
// here.
for (Chunk<E> *chunk : ctx.chunks)
if (chunk->shdr.sh_flags & SHF_EXECINSTR)
chunk->compute_section_size(ctx);
if (!ctx.arg.relocatable)
for (Chunk<E> *chunk : ctx.chunks)
if (OutputSection<E> *osec = chunk->to_osec())
if (osec->shdr.sh_flags & SHF_EXECINSTR)
osec->create_range_extension_thunks(ctx);
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (!(chunk->shdr.sh_flags & SHF_EXECINSTR))
chunk->compute_section_size(ctx);
});
} else {
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
chunk->compute_section_size(ctx);
});
}
}
@ -3009,23 +2932,34 @@ static void blake3_hash(u8 *buf, i64 size, u8 *out) {
}
template <typename E>
void compute_build_id(Context<E> &ctx) {
Timer t(ctx, "compute_build_id");
std::vector<std::span<u8>> get_shards(Context<E> &ctx) {
constexpr i64 shard_size = 4 * 1024 * 1024; // 4 MiB
std::span<u8> buf = {ctx.buf, (size_t)ctx.output_file->filesize};
std::vector<std::span<u8>> vec;
while (!buf.empty()) {
i64 sz = std::min<i64>(shard_size, buf.size());
vec.push_back(buf.subspan(0, sz));
buf = buf.subspan(sz);
}
return vec;
}
template <typename E>
void write_build_id(Context<E> &ctx) {
Timer t(ctx, "write_build_id");
switch (ctx.arg.build_id.kind) {
case BuildId::HEX:
ctx.buildid->contents = ctx.arg.build_id.value;
break;
case BuildId::HASH: {
i64 shard_size = 4 * 1024 * 1024;
i64 filesize = ctx.output_file->filesize;
i64 num_shards = align_to(filesize, shard_size) / shard_size;
std::vector<u8> shards(num_shards * BLAKE3_OUT_LEN);
std::vector<std::span<u8>> shards = get_shards(ctx);
std::vector<u8> hashes(shards.size() * BLAKE3_OUT_LEN);
tbb::parallel_for((i64)0, num_shards, [&](i64 i) {
u8 *begin = ctx.buf + shard_size * i;
u8 *end = (i == num_shards - 1) ? ctx.buf + filesize : begin + shard_size;
blake3_hash(begin, end - begin, shards.data() + i * BLAKE3_OUT_LEN);
tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
blake3_hash(shards[i].data(), shards[i].size(),
hashes.data() + i * BLAKE3_OUT_LEN);
#ifdef HAVE_MADVISE
// Make the kernel page out the file contents we've just written
@ -3036,7 +2970,7 @@ void compute_build_id(Context<E> &ctx) {
});
u8 buf[BLAKE3_OUT_LEN];
blake3_hash(shards.data(), shards.size(), buf);
blake3_hash(hashes.data(), hashes.size(), buf);
assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN);
ctx.buildid->contents = {buf, buf + ctx.arg.build_id.size()};
@ -3055,8 +2989,9 @@ void compute_build_id(Context<E> &ctx) {
default:
unreachable();
}
}
ctx.buildid->copy_buf(ctx);
}
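
write_build_id() above hashes the output in two levels: each 4 MiB shard is hashed in parallel, then the concatenation of the per-shard digests is hashed once more to produce the final build-id. A serial sketch of that scheme, assuming the official BLAKE3 C API (blake3_hasher_init/update/finalize):

#include <blake3.h>
#include <cstdint>
#include <span>
#include <vector>

static void hash_one(std::span<const uint8_t> in, uint8_t *out) {
  blake3_hasher h;
  blake3_hasher_init(&h);
  blake3_hasher_update(&h, in.data(), in.size());
  blake3_hasher_finalize(&h, out, BLAKE3_OUT_LEN);
}

// Hash each shard, then hash the concatenated digests. mold runs the first
// level under tbb::parallel_for; this sketch keeps it serial for clarity.
static void tree_hash(std::span<const std::span<const uint8_t>> shards,
                      uint8_t out[BLAKE3_OUT_LEN]) {
  std::vector<uint8_t> digests(shards.size() * BLAKE3_OUT_LEN);
  for (size_t i = 0; i < shards.size(); i++)
    hash_one(shards[i], digests.data() + i * BLAKE3_OUT_LEN);
  hash_one(digests, out);
}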
// Write Makefile-style dependency rules to a file specified by
// --dependency-file. This is analogous to the compiler's -M flag.
@ -3162,10 +3097,8 @@ template void apply_exclude_libs(Context<E> &);
template void create_synthetic_sections(Context<E> &);
template void resolve_symbols(Context<E> &);
template void kill_eh_frame_sections(Context<E> &);
template void split_section_pieces(Context<E> &);
template void resolve_section_pieces(Context<E> &);
template void create_merged_sections(Context<E> &);
template void convert_common_symbols(Context<E> &);
template void compute_merged_section_sizes(Context<E> &);
template void create_output_sections(Context<E> &);
template void add_synthetic_symbols(Context<E> &);
template void check_cet_errors(Context<E> &);
@ -3197,7 +3130,7 @@ template void compute_section_headers(Context<E> &);
template i64 set_osec_offsets(Context<E> &);
template void fix_synthetic_symbols(Context<E> &);
template i64 compress_debug_sections(Context<E> &);
template void compute_build_id(Context<E> &);
template void write_build_id(Context<E> &);
template void write_dependency_file(Context<E> &);
template void show_stats(Context<E> &);

View File

@ -148,8 +148,6 @@ static u64 r_set_osec_offsets(Context<E> &ctx) {
template <typename E>
void combine_objects(Context<E> &ctx) {
compute_merged_section_sizes(ctx);
create_output_sections(ctx);
r_create_synthetic_sections(ctx);

View File

@ -60,6 +60,7 @@ void notify_parent() {
char buf[] = {1};
[[maybe_unused]] int n = write(pipe_write_fd, buf, 1);
assert(n == 1);
pipe_write_fd = -1;
}
#endif
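
The added pipe_write_fd = -1 is what makes notify_parent idempotent: after the first successful notification the fd is invalidated, so later calls become no-ops. A minimal sketch of the resulting pattern, assuming the function guards on the fd before writing (the guard itself is outside the visible hunk, and notify_parent_sketch is an illustrative name):

#include <unistd.h>

static int pipe_write_fd = -1;

static void notify_parent_sketch() {
  if (pipe_write_fd == -1)
    return;                        // already notified (or no parent to notify)
  char buf[] = {1};
  [[maybe_unused]] ssize_t n = write(pipe_write_fd, buf, 1);
  pipe_write_fd = -1;              // subsequent calls are no-ops
}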

View File

@ -170,6 +170,10 @@ static void scan_rels(Context<E> &ctx, InputSection<E> &isec,
template <>
void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
// This function is not thread-safe because it mutates symbols' members
static std::mutex mu;
std::scoped_lock lock(mu);
std::span<InputSection<E> *> m = members;
if (m.empty())
return;
@ -247,10 +251,8 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
// Scan relocations between B and C to collect symbols that need
// entries in the new thunk.
tbb::parallel_for_each(m.begin() + b, m.begin() + c,
[&](InputSection<E> *isec) {
scan_rels(ctx, *isec, *thunk, thunk_idx);
});
for (i64 i = b; i < c; i++)
scan_rels(ctx, *m[i], *thunk, thunk_idx);
// Now that we know the number of symbols in the thunk, we can compute
// the thunk's size.
@ -270,16 +272,15 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
}
// Scan relocations again to fix symbol offsets in the last thunk.
tbb::parallel_for_each(m.begin() + b, m.begin() + c,
[&](InputSection<E> *isec) {
std::span<Symbol<E> *> syms = isec->file.symbols;
std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
std::span<ThunkRef> thunk_refs = isec->extra.thunk_refs;
for (i64 i = b; i < c; i++) {
std::span<Symbol<E> *> syms = m[i]->file.symbols;
std::span<const ElfRel<E>> rels = m[i]->get_rels(ctx);
std::span<ThunkRef> thunk_refs = m[i]->extra.thunk_refs;
for (i64 i = 0; i < rels.size(); i++)
if (thunk_refs[i].thunk_idx == thunk_idx)
thunk_refs[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx;
});
for (i64 j = 0; j < rels.size(); j++)
if (thunk_refs[j].thunk_idx == thunk_idx)
thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx;
}
// Move B forward to point to the beginning of the next batch.
b = c;