mirror of https://github.com/rui314/mold.git synced 2024-09-11 13:06:59 +03:00
This commit is contained in:
Rui Ueyama 2024-07-06 15:02:50 +09:00
parent 5b4377842b
commit e78e12b15b
4 changed files with 112 additions and 107 deletions


@@ -373,7 +373,7 @@ public:
virtual ~Chunk() = default;
virtual bool is_header() { return false; }
virtual OutputSection<E> *to_osec() { return nullptr; }
virtual MergedSection<E> *to_merged_section() { return nullptr; }
virtual void compute_section_size(Context<E> &ctx) {}
virtual i64 get_reldyn_size(Context<E> &ctx) const { return 0; }
virtual void construct_relr(Context<E> &ctx) {}
virtual void copy_buf(Context<E> &ctx) {}
@@ -480,6 +480,7 @@ public:
}
OutputSection<E> *to_osec() override { return this; }
void compute_section_size(Context<E> &ctx) override;
void construct_relr(Context<E> &ctx) override;
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
@@ -806,9 +807,8 @@ public:
SectionFragment<E> *insert(Context<E> &ctx, std::string_view data,
u64 hash, i64 p2align);
MergedSection<E> *to_merged_section() override { return this; }
void resolve(Context<E> &ctx);
void assign_offsets(Context<E> &ctx);
void compute_section_size(Context<E> &ctx) override;
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
void print_stats(Context<E> &ctx);
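The net effect of these declarations is that section sizing goes through a single virtual on Chunk: OutputSection and MergedSection both override compute_section_size, and MergedSection's former assign_offsets step is folded into its override (see the passes.cc hunk below). A minimal sketch, using simplified non-template stand-ins for Chunk<E> and Context<E> (illustration only, not mold's real types), of how the driver can then size every chunk uniformly:

#include <vector>

struct Context;

// Simplified stand-ins for mold's Chunk<E> hierarchy (illustration only).
struct Chunk {
  virtual ~Chunk() = default;
  virtual void compute_section_size(Context &ctx) {} // default: nothing to size
};

struct OutputSection : Chunk {
  void compute_section_size(Context &ctx) override { /* assign member offsets */ }
};

struct MergedSection : Chunk {
  void compute_section_size(Context &ctx) override { /* assign fragment offsets */ }
};

struct Context {
  std::vector<Chunk *> chunks;
};

// The driver no longer needs to special-case merged sections.
void compute_section_sizes(Context &ctx) {
  for (Chunk *chunk : ctx.chunks)
    chunk->compute_section_size(ctx);
}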


@@ -865,6 +865,84 @@ void DynamicSection<E>::copy_buf(Context<E> &ctx) {
write_vector(ctx.buf + this->shdr.sh_offset, contents);
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
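// An illustrative aside (not part of this file): split() divides a vector
// into fixed-size views without copying, so e.g. splitting 25 elements with
// unit = 10 yields spans of 10, 10 and 5 elements:
//
//   std::vector<int> v(25);
//   std::vector<std::span<int>> parts = split(v, 10);
//   assert(parts.size() == 3);
//   assert(parts[0].size() == 10 && parts[2].size() == 5);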
// Assign offsets to OutputSection members
template <typename E>
void OutputSection<E>::compute_section_size(Context<E> &ctx) {
ElfShdr<E> &shdr = this->shdr;
// On most RISC systems, we need to create so-called "range extension
// thunks" to extend branch instructions' reach, as their jump
// instructions can cover only a limited displacement.
// create_range_extension_thunks() computes the size of the section
// while inserting thunks.
if constexpr (needs_thunk<E>) {
if ((shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) {
create_range_extension_thunks(ctx);
return;
}
}
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
struct Group {
std::span<InputSection<E> *> members;
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
};
std::span<InputSection<E> *> mem = members;
std::vector<Group> groups;
constexpr i64 group_size = 10000;
while (!mem.empty()) {
i64 sz = std::min<i64>(group_size, mem.size());
groups.push_back({mem.subspan(0, sz)});
mem = mem.subspan(sz);
}
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
}
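// A worked illustration of the three passes above (hypothetical numbers,
// with a group size of 2 instead of 10000): say the section has four members
// with (sh_size, p2align) = (6,2), (3,0), (5,3), (1,0).
//
//   Pass 1 (parallel): group 0 = {(6,2), (3,0)} gets size 9, p2align 2;
//   group 1 = {(5,3), (1,0)} gets size 6, p2align 3.
//
//   Pass 2 (serial): group 0 starts at offset 0; group 1 starts at
//   align_to(9, 8) = 16, so sh_size = 16 + 6 = 22 and sh_addralign >= 8.
//
//   Pass 3 (parallel): offsets are re-derived inside each group from its
//   base, so the members land at 0, 6, 16, and 21 respectively.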
template <typename E>
void OutputSection<E>::copy_buf(Context<E> &ctx) {
if (this->shdr.sh_type != SHT_NOBITS)
@@ -2009,7 +2087,7 @@ void MergedSection<E>::resolve(Context<E> &ctx) {
}
template <typename E>
void MergedSection<E>::assign_offsets(Context<E> &ctx) {
void MergedSection<E>::compute_section_size(Context<E> &ctx) {
if (!resolved)
resolve(ctx);


@@ -466,20 +466,6 @@ static std::string get_cmdline_args(Context<E> &ctx) {
return ss.str();
}
template <typename T>
static std::vector<std::span<T>> split(std::vector<T> &input, i64 unit) {
std::span<T> span(input);
std::vector<std::span<T>> vec;
while (span.size() >= unit) {
vec.push_back(span.subspan(0, unit));
span = span.subspan(unit);
}
if (!span.empty())
vec.push_back(span);
return vec;
}
template <typename E>
static bool has_ctors_and_init_array(Context<E> &ctx) {
bool x = false;
@@ -1351,84 +1337,24 @@ template <typename E>
void compute_section_sizes(Context<E> &ctx) {
Timer t(ctx, "compute_section_sizes");
struct Group {
i64 size = 0;
i64 p2align = 0;
i64 offset = 0;
std::span<InputSection<E> *> members;
};
// Assign offsets to OutputSection members
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
OutputSection<E> *osec = chunk->to_osec();
if (!osec)
return;
// This pattern will be processed in the next loop.
if constexpr (needs_thunk<E>)
if ((osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable)
return;
// Since one output section may contain millions of input sections,
// we first split input sections into groups and assign offsets to
// groups.
std::vector<Group> groups;
constexpr i64 group_size = 10000;
for (std::span<InputSection<E> *> span : split(osec->members, group_size))
groups.push_back(Group{.members = span});
tbb::parallel_for_each(groups, [](Group &group) {
for (InputSection<E> *isec : group.members) {
group.size = align_to(group.size, 1 << isec->p2align) + isec->sh_size;
group.p2align = std::max<i64>(group.p2align, isec->p2align);
}
});
ElfShdr<E> &shdr = osec->shdr;
shdr.sh_size = 0;
for (i64 i = 0; i < groups.size(); i++) {
shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align);
groups[i].offset = shdr.sh_size;
shdr.sh_size += groups[i].size;
shdr.sh_addralign = std::max<u32>(shdr.sh_addralign, 1 << groups[i].p2align);
}
// Assign offsets to input sections.
tbb::parallel_for_each(groups, [](Group &group) {
i64 offset = group.offset;
for (InputSection<E> *isec : group.members) {
offset = align_to(offset, 1 << isec->p2align);
isec->offset = offset;
offset += isec->sh_size;
}
});
});
// Assign offsets to MergedSection members
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (MergedSection<E> *sec = chunk->to_merged_section())
sec->assign_offsets(ctx);
});
// On ARM32 or ARM64, we may need to create so-called "range extension
// thunks" to extend branch instructions' reach, as they can jump only
// to ±16 MiB or ±128 MiB, respectively.
//
// In the following loop, we compute the sizes of sections while
// inserting thunks. This pass cannot be parallelized. That is,
// create_range_extension_thunks is parallelized internally, but the
// function itself is not thread-safe.
if constexpr (needs_thunk<E>) {
Timer t(ctx, "create_range_extension_thunks");
// Chunk<E>::compute_section_size may obtain a global lock to create
// range extension thunks. I don't know why, but using a parallel_for
// loop both inside and outside of the lock may cause a deadlock. It
// might be a bug in TBB. For now, I'll avoid using parallel_for_each
// here.
for (Chunk<E> *chunk : ctx.chunks)
if (chunk->shdr.sh_flags & SHF_EXECINSTR)
chunk->compute_section_size(ctx);
if (!ctx.arg.relocatable)
for (Chunk<E> *chunk : ctx.chunks)
if (OutputSection<E> *osec = chunk->to_osec())
if (osec->shdr.sh_flags & SHF_EXECINSTR)
osec->create_range_extension_thunks(ctx);
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
if (!(chunk->shdr.sh_flags & SHF_EXECINSTR))
chunk->compute_section_size(ctx);
});
} else {
tbb::parallel_for_each(ctx.chunks, [&](Chunk<E> *chunk) {
chunk->compute_section_size(ctx);
});
}
}


@@ -170,6 +170,10 @@ static void scan_rels(Context<E> &ctx, InputSection<E> &isec,
template <>
void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
// This function is not thread-safe because it mutates symbols' members
static std::mutex mu;
std::scoped_lock lock(mu);
std::span<InputSection<E> *> m = members;
if (m.empty())
return;
@@ -247,10 +251,8 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
// Scan relocations between B and C to collect symbols that need
// entries in the new thunk.
tbb::parallel_for_each(m.begin() + b, m.begin() + c,
[&](InputSection<E> *isec) {
scan_rels(ctx, *isec, *thunk, thunk_idx);
});
for (i64 i = b; i < c; i++)
scan_rels(ctx, *m[i], *thunk, thunk_idx);
// Now that we know the number of symbols in the thunk, we can compute
// the thunk's size.
@@ -270,16 +272,15 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
}
// Scan relocations again to fix symbol offsets in the last thunk.
tbb::parallel_for_each(m.begin() + b, m.begin() + c,
[&](InputSection<E> *isec) {
std::span<Symbol<E> *> syms = isec->file.symbols;
std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
std::span<ThunkRef> thunk_refs = isec->extra.thunk_refs;
for (i64 i = b; i < c; i++) {
std::span<Symbol<E> *> syms = m[i]->file.symbols;
std::span<const ElfRel<E>> rels = m[i]->get_rels(ctx);
std::span<ThunkRef> thunk_refs = m[i]->extra.thunk_refs;
for (i64 i = 0; i < rels.size(); i++)
if (thunk_refs[i].thunk_idx == thunk_idx)
thunk_refs[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx;
});
for (i64 j = 0; j < rels.size(); j++)
if (thunk_refs[j].thunk_idx == thunk_idx)
thunk_refs[j].sym_idx = syms[rels[j].r_sym]->extra.thunk_sym_idx;
}
// Move B forward to point to the beginning of the next batch.
b = c;