mirror of https://github.com/rui314/mold.git synced 2024-09-11 13:06:59 +03:00
This commit is contained in:
Rui Ueyama 2024-07-06 15:02:50 +09:00
parent 5b4377842b
commit e78e12b15b
4 changed files with 112 additions and 107 deletions


@@ -373,7 +373,7 @@ public:
virtual ~Chunk() = default;
virtual bool is_header() { return false; }
virtual OutputSection<E> *to_osec() { return nullptr; }
virtual MergedSection<E> *to_merged_section() { return nullptr; }
virtual void compute_section_size(Context<E> &ctx) {}
virtual i64 get_reldyn_size(Context<E> &ctx) const { return 0; }
virtual void construct_relr(Context<E> &ctx) {}
virtual void copy_buf(Context<E> &ctx) {}
@@ -480,6 +480,7 @@ public:
}
OutputSection<E> *to_osec() override { return this; }
void compute_section_size(Context<E> &ctx) override;
void construct_relr(Context<E> &ctx) override;
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
@@ -806,9 +807,8 @@ public:
SectionFragment<E> *insert(Context<E> &ctx, std::string_view data,
u64 hash, i64 p2align);
MergedSection<E> *to_merged_section() override { return this; }
void resolve(Context<E> &ctx);
void assign_offsets(Context<E> &ctx);
void compute_section_size(Context<E> &ctx) override;
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
void print_stats(Context<E> &ctx);
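The net effect of these declarations is that section sizing goes through a single virtual on Chunk: OutputSection and MergedSection both override compute_section_size, and MergedSection's former assign_offsets step is folded into its override (see the passes.cc hunk below). A minimal sketch, using simplified non-template stand-ins for Chunk<E> and Context<E> (illustration only, not mold's real types), of how the driver can then size every chunk uniformly:

#include <vector>

struct Context;

// Simplified stand-ins for mold's Chunk<E> hierarchy (illustration only).
struct Chunk {
  virtual ~Chunk() = default;
  virtual void compute_section_size(Context &ctx) {} // default: nothing to size
};

struct OutputSection : Chunk {
  void compute_section_size(Context &ctx) override { /* assign member offsets */ }
};

struct MergedSection : Chunk {
  void compute_section_size(Context &ctx) override { /* assign fragment offsets */ }
};

struct Context {
  std::vector<Chunk *> chunks;
};

// The driver no longer needs to special-case merged sections.
void compute_section_sizes(Context &ctx) {
  for (Chunk *chunk : ctx.chunks)
    chunk->compute_section_size(ctx);
}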


@@ -865,6 +865,84 @@ void DynamicSection<E>::copy_buf(Context<E> &ctx) {
write_vector(ctx.buf + this->shdr.sh_offset, contents);
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
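// An illustrative aside (not part of this file): split() divides a vector
// into fixed-size views without copying, so e.g. splitting 25 elements with
// unit = 10 yields spans of 10, 10 and 5 elements:
//
//   std::vector<int> v(25);
//   std::vector<std::span<int>> parts = split(v, 10);
//   assert(parts.size() == 3);
//   assert(parts[0].size() == 10 && parts[2].size() == 5);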
// Assign offsets to OutputSection members
template <typename E>
void OutputSection<E>::compute_section_size(Context<E> &ctx) {
ElfShdr<E> &shdr = this->shdr;
// On most RISC systems, we need to create so-called "range extension
// thunks" to extend branch instructions' reach, as their jump
// instructions can cover only a limited displacement.
// create_range_extension_thunks() computes the size of the section
// while inserting thunks.
if constexpr (needs_thunk<E>) {
if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) {
create_range_extension_thunks(ctx);
return;
}
}
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
struct Group {
std::span<InputSection<E> *> members;
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
};
std::span<InputSection<E> *> mem = members;
std::vector<Group> groups;
constexpr i64 group_size = 10000;
while (!mem.empty()) {
i64 sz = std::min<i64>(group_size, mem.size());
groups.push_back({mem.subspan(0, sz)});
mem = mem.subspan(sz);
}
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
}
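// A worked illustration of the three passes above (hypothetical numbers,
// with a group size of 2 instead of 10000): say the section has four members
// with (sh_size, p2align) = (6,2), (3,0), (5,3), (1,0).
//
//   Pass 1 (parallel): group 0 = {(6,2), (3,0)} gets size 9, p2align 2;
//   group 1 = {(5,3), (1,0)} gets size 6, p2align 3.
//
//   Pass 2 (serial): group 0 starts at offset 0; group 1 starts at
//   align_to(9, 8) = 16, so sh_size = 16 + 6 = 22 and sh_addralign >= 8.
//
//   Pass 3 (parallel): offsets are re-derived inside each group from its
//   base, so the members land at 0, 6, 16, and 21 respectively.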
template <typename E>
void OutputSection<E>::copy_buf(Context<E> &ctx) {
if (this->shdr.sh_type != SHT_NOBITS)
@@ -2009,7 +2087,7 @@ void MergedSection<E>::resolve(Context<E> &ctx) {
}
template <typename E>
void MergedSection<E>::assign_offsets(Context<E> &ctx) {
void MergedSection<E>::compute_section_size(Context<E> &ctx) {
if (!resolved)
resolve(ctx);


@@ -466,20 +466,6 @@ static std::string get_cmdline_args(Context<E> &ctx) {
return ss.str();
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
template <typename E>
static bool has_ctors_and_init_array(Context<E> &ctx) {
bool x = false;
@@ -1351,84 +1337,24 @@ template <typename E>
void compute_section_sizes(Context<E> &ctx) {
Timer t(ctx, "compute_section_sizes");
struct Group {
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
std::span<InputSection<E> *> members;
};
// Assign offsets to OutputSection members
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
OutputSection<E> *osec = chunk->to_osec();
if (!osec)
return;
// This pattern will be processed in the next loop.
if constexpr (needs_thunk<E>)
if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable)
return;
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
std::vector<Group> groups;
constexpr i64 group_size = 10000;
for (std::span<InputSection<E> *> span : split(osec->members, group_size))
groups.push_back(Group{.members = span});
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
ElfShdr<E> &shdr = osec->shdr;
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
});
// Assign offsets to MergedSection members
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (MergedSection<E> *sec = chunk->to_merged_section())
sec->assign_offsets(ctx);
});
// On ARM32 or ARM64, we may need to create so-called "range extension
// thunks" to extend branch instructions' reach, as they can jump only
// to ±16 MiB or ±128 MiB, respectively.
//
// In the following loop, we compute the sizes of sections while
// inserting thunks. This pass cannot be parallelized. That is,
// create_range_extension_thunks is parallelized internally, but the
// function itself is not thread-safe.
if constexpr (needs_thunk<E>) {
Timer t(ctx, "create_range_extension_thunks");
// Chunk<E>::compute_section_size may obtain a global lock to create
// range extension thunks. I don't know why, but using a parallel_for
// loop both inside and outside of the lock may cause a deadlock. It
// might be a bug in TBB. For now, I'll avoid using parallel_for_each
// here.
for (Chunk<E> *chunk : ctx.chunks)
if (chunk->shdr.sh_flags & SHF_EXECINSTR)
chunk->compute_section_size(ctx);
if (!ctx.arg.relocatable)
for (Chunk<E> *chunk : ctx.chunks)
if (OutputSection<E> *osec = chunk->to_osec())
if (osec->shdr.sh_flags & SHF_EXECINSTR)
osec->create_range_extension_thunks(ctx);
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (!(chunk->shdr.sh_flags & SHF_EXECINSTR))
chunk->compute_section_size(ctx);
});
} else {
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
chunk->compute_section_size(ctx);
});
}
}


@@ -170,6 +170,10 @@ static void scan_rels(Context<E> &ctx, InputSection<E> &isec,
template <>
void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
// This function is not thread-safe because it mutates symbols' members
static std::mutex mu;
std::scoped_lock lock(mu);
std::span<InputSection<E> *> m = members;
if (m.empty())
return;
@@ -247,10 +251,8 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
// Scan relocations between B and C to collect symbols that need
// entries in the new thunk.
tbb::parallel_for_each(m.begin() + b, m.begin() + c,
[&](InputSection<E> *isec) {
scan_rels(ctx, *isec, *thunk, thunk_idx);
});
for (i64 i = b; i < c; i++)
scan_rels(ctx, *m[i], *thunk, thunk_idx);
// Now that we know the number of symbols in the thunk, we can compute
// the thunk's size.
@@ -270,16 +272,15 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
}
// Scan relocations again to fix symbol offsets in the last thunk.
tbb::parallel_for_each(m.begin() + b, m.begin() + c,
[&](InputSection<E> *isec) {
std::span<Symbol<E> *> syms = isec->file.symbols;
std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
std::span<ThunkRef> thunk_refs = isec->extra.thunk_refs;
for (i64 i = b; i < c; i++) {
std::span<Symbol<E> *> syms = m[i]->file.symbols;
std::span<const ElfRel<E>> rels = m[i]->get_rels(ctx);
std::span<ThunkRef> thunk_refs = m[i]->extra.thunk_refs;
for (i64 i = 0; i < rels.size(); i++)
if (thunk_refs[i].thunk_idx == thunk_idx)
thunk_refs[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx;
});
for (i64 j = 0; j < rels.size(); j++)
if (thunk_refs[j].thunk_idx == thunk_idx)
thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx;
}
// Move B forward to point to the beginning of the next batch.
b = c;