
Optimize string merging

Linking clang-13 with debug info takes ~3.6 seconds on a simulated
10-core/20-thread machine. mold spends most of that time (~2.3 seconds)
merging string literals in .debug_str. The input .debug_str sections
contain 70 million string literals in total, which are reduced to
2 million after de-duplication, so the input object files contain a
huge number of duplicates. clang-13 with debug info is enormous --
~3.1 GiB after linking.

It looks like TBB's concurrent hashmap doesn't scale well with this
kind of input.

In this patch, I implemented our own concurrent hashmap. The hashmap
is extremely lightweight and supports only key-value insertion. It
doesn't even support rehashing; it simply aborts once the hash table
becomes full.
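
Conceptually, it is an open-addressing, insert-only table in which a
slot's key pointer doubles as its state: null means empty, a sentinel
value means "being initialized", anything else means published. Below
is a rough single-file sketch of that idea, not mold's implementation
(see concurrent_map.cc in this commit); the class name InsertOnlyMap
is made up, and it assumes a power-of-two size chosen up front and
NUL-terminated keys that outlive the map.

// Minimal sketch of an insert-only, fixed-size concurrent hash map.
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

template <typename T>
class InsertOnlyMap {
public:
  explicit InsertOnlyMap(size_t nbuckets) : keys(nbuckets), values(nbuckets) {
    // Values are default-constructed up front; size must be a power of two.
    assert(nbuckets && (nbuckets & (nbuckets - 1)) == 0);
  }

  // Returns {value, true} if this call inserted the key, {value, false} if
  // another caller already had, and {nullptr, false} if the table is full.
  std::pair<T *, bool> insert(const char *key, size_t hash, const T &val) {
    size_t mask = keys.size() - 1;
    size_t idx = hash & mask;
    for (size_t retry = 0; retry < keys.size();) {
      const char *ptr = keys[idx].load();
      if (ptr == LOCKED)
        continue;                        // slot is being initialized; spin
                                         // (the real code issues a pause here)
      if (ptr == nullptr) {
        if (!keys[idx].compare_exchange_strong(ptr, LOCKED))
          continue;                      // lost the race for this slot; retry
        values[idx] = val;               // initialize the value first ...
        keys[idx] = key;                 // ... then publish the key
        return {&values[idx], true};
      }
      if (strcmp(ptr, key) == 0)
        return {&values[idx], false};    // key already present
      idx = (idx + 1) & mask;            // linear probing, no rehashing
      retry++;
    }
    return {nullptr, false};             // table is full; the real map aborts
  }

private:
  static inline const char *LOCKED = reinterpret_cast<const char *>(UINTPTR_MAX);
  std::vector<std::atomic<const char *>> keys;
  std::vector<T> values;
};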

In order to choose the right size for the hashmap before inserting
strings into it, I also implemented the HyperLogLog algorithm in this
patch. HyperLogLog gives a fairly accurate estimate of the number of
unique elements in a multiset.
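
Roughly, HyperLogLog hashes each element, uses the low bits of the
hash to pick one of m buckets, and keeps in each bucket the longest
run of leading zero bits observed; the number of distinct elements is
then estimated from a harmonic mean of the per-bucket values. A tiny
single-threaded sketch follows; the class name CardinalityEstimator
is made up, while NBUCKETS and ALPHA match the values the patch uses.
It assumes well-mixed, nonzero 32-bit hashes (the patch feeds it
xxHash values) and a GCC/Clang compiler for __builtin_clz.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

class CardinalityEstimator {
public:
  void insert(uint32_t hash) {
    size_t idx = hash & (NBUCKETS - 1);      // low bits select the bucket
    uint8_t rank = __builtin_clz(hash) + 1;  // leading zeros in the hash, plus one
    if (buckets[idx] < rank)
      buckets[idx] = rank;                   // keep the maximum rank per bucket
  }

  int64_t get_cardinality() const {
    double z = 0;
    for (uint8_t v : buckets)
      z += std::pow(2.0, -v);
    return int64_t(ALPHA * NBUCKETS * NBUCKETS / z);  // harmonic-mean estimate
  }

private:
  static constexpr int64_t NBUCKETS = 2048;  // same as the patch
  static constexpr double ALPHA = 0.79402;   // bias-correction constant from the patch
  std::vector<uint8_t> buckets = std::vector<uint8_t>(NBUCKETS);
};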

With this patch, mold can link clang-13 in ~2.5 seconds, which is ~30%
faster than before.

https://github.com/rui314/mold/issues/73
Rui Ueyama 2021-07-12 11:00:10 +09:00
parent 6bb091595e
commit 41b2fa7375
8 changed files with 288 additions and 111 deletions

Makefile

@ -8,11 +8,12 @@ CPPFLAGS = -g -Imimalloc/include -pthread -std=c++20 \
-DGIT_HASH=\"$(GIT_HASH)\" \
$(EXTRA_CPPFLAGS)
LDFLAGS += $(EXTRA_LDFLAGS) -rdynamic
LIBS = -Wl,-as-needed -lcrypto -pthread -lz -lxxhash -ldl
LIBS = -Wl,-as-needed -lcrypto -pthread -lz -lxxhash -ldl -lm
OBJS = main.o object_file.o input_sections.o output_chunks.o \
mapfile.o perf.o linker_script.o archive_file.o output_file.o \
subprocess.o gc_sections.o icf.o symbols.o cmdline.o filepath.o \
passes.o tar.o compress.o memory_mapped_file.o relocatable.o \
concurrent_map.o hyperloglog.o \
arch_x86_64.o arch_i386.o arch_aarch64.o
PREFIX ?= /usr

concurrent_map.cc (new file, 80 lines)

@ -0,0 +1,80 @@
#include "mold.h"
static const char *locked = (char *)-1;
static constexpr i64 MIN_NBUCKETS = 256;
template <typename T>
ConcurrentMap<T>::ConcurrentMap() {}
template <typename T>
ConcurrentMap<T>::ConcurrentMap(i64 nbuckets) {
resize(nbuckets);
}
template <typename T>
void ConcurrentMap<T>::resize(i64 nbuckets) {
this->~ConcurrentMap();
nbuckets = std::max<i64>(MIN_NBUCKETS, next_power_of_two(nbuckets));
this->nbuckets = nbuckets;
keys = (std::atomic<const char *> *)calloc(nbuckets, sizeof(keys[0]));
sizes = (u32 *)calloc(nbuckets, sizeof(sizes[0]));
values = (T *)calloc(nbuckets, sizeof(values[0]));
}
template <typename T>
ConcurrentMap<T>::~ConcurrentMap() {
if (keys) {
free((void *)keys);
free((void *)sizes);
free((void *)values);
}
}
template <typename T>
std::pair<T *, bool>
ConcurrentMap<T>::insert(std::string_view key, u64 hash, const T &val) {
if (!keys)
return {nullptr, false};
ASSERT(__builtin_popcount(nbuckets) == 1);
i64 idx = hash & (nbuckets - 1);
i64 nretry = 0;
while (nretry < MIN_NBUCKETS) {
const char *ptr = keys[idx];
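// A key equal to the `locked` sentinel means another thread has claimed
// this slot and is still initializing it; spin until the key is published.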
if (ptr == locked) {
#ifdef __x86_64__
asm volatile("pause" ::: "memory");
#endif
continue;
}
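// Empty slot: claim it by CAS'ing in the lock sentinel, construct the
// value, then publish the key last so readers never see a half-built entry.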
if (ptr == nullptr) {
if (!keys[idx].compare_exchange_strong(ptr, locked))
continue;
new (values + idx) T(val);
sizes[idx] = key.size();
keys[idx] = key.data();
return {values + idx, true};
}
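// Occupied slot: if the size and bytes match, the string is already in the
// table; otherwise move on to the next slot (linear probing).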
if (key.size() == sizes[idx] && memcmp(ptr, key.data(), sizes[idx]) == 0)
return {values + idx, false};
idx = (idx + 1) & (nbuckets - 1);
nretry++;
}
ASSERT(false && "ConcurrentMap is full");
return {nullptr, false};
}
#define INSTANTIATE(E) \
template class ConcurrentMap<SectionFragment<E>>;
INSTANTIATE(X86_64);
INSTANTIATE(I386);
INSTANTIATE(AARCH64);

hyperloglog.cc (new file, 21 lines)

@ -0,0 +1,21 @@
// This file implements HyperLogLog algorithm, which estimates
// the number of unique items in a given multiset.
//
// For more info, read
// https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog
#include "mold.h"
#include <cmath>
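// The raw estimate is ALPHA * NBUCKETS^2 / sum_i 2^(-buckets[i]).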
i64 HyperLogLog::get_cardinality() const {
double z = 0;
for (i64 val : buckets)
z += pow(2, -val);
return ALPHA * NBUCKETS * NBUCKETS / z;
}
void HyperLogLog::merge(const HyperLogLog &other) {
for (i64 i = 0; i < NBUCKETS; i++)
merge_one(i, other.buckets[i]);
}

main.cc

@ -415,6 +415,13 @@ int do_main(int argc, char **argv) {
if (ctx.objs.empty())
Fatal(ctx) << "no input files";
{
Timer t(ctx, "register_section_pieces");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
file->register_section_pieces(ctx);
});
}
// Uniquify shared object files by soname
{
std::unordered_set<std::string_view> seen;

mold.h (86 lines changed)

@ -63,6 +63,8 @@ template <typename E> class ROutputShdr;
template <typename E> class RStrtabSection;
template <typename E> class RSymtabSection;
template <typename T> class ConcurrentMap;
class ZlibCompressor;
class GzipCompressor;
class TarFile;
@ -302,6 +304,59 @@ private:
void report_undef(Context<E> &ctx, Symbol<E> &sym);
};
//
// hyperloglog.cc
//
class HyperLogLog {
public:
HyperLogLog() : buckets(NBUCKETS) {}
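// The low bits of the hash select a bucket; each bucket keeps the maximum
// leading-zero count (plus one) of the hashes that mapped to it.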
void insert(u32 hash) {
merge_one(hash & (NBUCKETS - 1), __builtin_clz(hash) + 1);
}
void merge_one(i64 idx, u8 newval) {
u8 cur = buckets[idx];
while (cur < newval)
if (buckets[idx].compare_exchange_strong(cur, newval))
break;
}
i64 get_cardinality() const;
void merge(const HyperLogLog &other);
private:
static constexpr i64 NBUCKETS = 2048;
static constexpr double ALPHA = 0.79402;
std::vector<std::atomic_uint8_t> buckets;
};
//
// concurrent_map.cc
//
template <typename T>
class ConcurrentMap {
public:
ConcurrentMap();
ConcurrentMap(i64 nbuckets);
~ConcurrentMap();
void resize(i64 nbuckets);
std::pair<T *, bool> insert(std::string_view key, u64 hash, const T &val);
bool has_key(i64 idx) {
return keys[idx];
}
i64 nbuckets = 0;
std::atomic<const char *> *keys = nullptr;
u32 *sizes = nullptr;
T *values = nullptr;
};
//
// output_chunks.cc
//
@ -645,27 +700,22 @@ public:
static MergedSection<E> *
get_instance(Context<E> &ctx, std::string_view name, u64 type, u64 flags);
SectionFragment<E> *insert(std::string_view data, i64 alignment);
void assign_offsets();
SectionFragment<E> *insert(std::string_view data, u64 hash, i64 alignment);
void assign_offsets(Context<E> &ctx);
void copy_buf(Context<E> &ctx) override;
void write_to(Context<E> &ctx, u8 *buf) override;
private:
using MapTy =
tbb::concurrent_unordered_map<std::string_view, SectionFragment<E>>;
HyperLogLog estimator;
private:
static constexpr i64 NUM_SHARDS = 64;
MergedSection(std::string_view name, u64 flags, u32 type)
: OutputChunk<E>(this->SYNTHETIC) {
this->name = name;
this->shdr.sh_flags = flags;
this->shdr.sh_type = type;
}
MergedSection(std::string_view name, u64 flags, u32 type);
MapTy maps[NUM_SHARDS];
ConcurrentMap<SectionFragment<E>> map;
i64 shard_offsets[NUM_SHARDS + 1] = {};
tbb::enumerable_thread_specific<i64> max_alignments;
std::once_flag once_flag;
};
template <typename E>
@ -869,6 +919,16 @@ struct ComdatGroup {
std::atomic_uint32_t owner = -1;
};
template <typename E>
struct MergeableSection {
MergedSection<E> *parent;
ElfShdr<E> shdr;
std::vector<std::string_view> strings;
std::vector<u64> hashes;
std::vector<u32> frag_offsets;
std::vector<SectionFragment<E> *> fragments;
};
// InputFile is the base class of ObjectFile and SharedFile.
template <typename E>
class InputFile {
@ -911,6 +971,7 @@ public:
static ObjectFile<E> *create_internal_file(Context<E> &ctx);
void parse(Context<E> &ctx);
void register_section_pieces(Context<E> &ctx);
void resolve_lazy_symbols(Context<E> &ctx);
void resolve_regular_symbols(Context<E> &ctx);
void mark_live_objects(Context<E> &ctx,
@ -981,6 +1042,7 @@ private:
std::string_view symbol_strtab;
const ElfShdr<E> *symtab_sec;
std::span<u32> symtab_shndx_sec;
std::vector<std::unique_ptr<MergeableSection<E>>> mergeable_sections;
};
// SharedFile represents an input .so file.

object_file.cc

@ -510,12 +510,6 @@ void ObjectFile<E>::initialize_symbols(Context<E> &ctx) {
}
}
template <typename E>
struct MergeableSection {
std::vector<SectionFragment<E> *> fragments;
std::vector<u32> frag_offsets;
};
static size_t find_null(std::string_view data, u64 entsize) {
if (entsize == 1)
return data.find('\0');
@ -545,17 +539,17 @@ static size_t find_null(std::string_view data, u64 entsize) {
//
// We do not support mergeable sections that have relocations.
template <typename E>
static MergeableSection<E>
static std::unique_ptr<MergeableSection<E>>
split_section(Context<E> &ctx, InputSection<E> &sec) {
MergeableSection<E> rec;
MergedSection<E> *parent =
MergedSection<E>::get_instance(ctx, sec.name(), sec.shdr.sh_type,
sec.shdr.sh_flags);
std::unique_ptr<MergeableSection<E>> rec(new MergeableSection<E>);
rec->parent = MergedSection<E>::get_instance(ctx, sec.name(), sec.shdr.sh_type,
sec.shdr.sh_flags);
rec->shdr = sec.shdr;
std::string_view data = sec.contents;
const char *begin = data.data();
u64 entsize = sec.shdr.sh_entsize;
HyperLogLog estimator;
static_assert(sizeof(SectionFragment<E>::alignment) == 2);
if (sec.shdr.sh_addralign >= UINT16_MAX)
@ -570,9 +564,12 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
std::string_view substr = data.substr(0, end + entsize);
data = data.substr(end + entsize);
SectionFragment<E> *frag = parent->insert(substr, sec.shdr.sh_addralign);
rec.fragments.push_back(frag);
rec.frag_offsets.push_back(substr.data() - begin);
rec->strings.push_back(substr);
rec->frag_offsets.push_back(substr.data() - begin);
u64 hash = hash_string(substr);
rec->hashes.push_back(hash);
estimator.insert(hash);
}
} else {
if (data.size() % entsize)
@ -582,15 +579,19 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
std::string_view substr = data.substr(0, entsize);
data = data.substr(entsize);
SectionFragment<E> *frag = parent->insert(substr, sec.shdr.sh_addralign);
rec.fragments.push_back(frag);
rec.frag_offsets.push_back(substr.data() - begin);
rec->strings.push_back(substr);
rec->frag_offsets.push_back(substr.data() - begin);
u64 hash = hash_string(substr);
rec->hashes.push_back(hash);
estimator.insert(hash);
}
}
static Counter counter("string_fragments");
counter += rec.fragments.size();
rec->parent->estimator.merge(estimator);
static Counter counter("string_fragments");
counter += rec->fragments.size();
return rec;
}
@ -638,7 +639,7 @@ split_section(Context<E> &ctx, InputSection<E> &sec) {
// is attached to the symbol.
template <typename E>
void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
std::vector<MergeableSection<E>> mergeable_sections(sections.size());
mergeable_sections.resize(sections.size());
for (i64 i = 0; i < sections.size(); i++) {
std::unique_ptr<InputSection<E>> &isec = sections[i];
@ -648,6 +649,15 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
isec->is_alive = false;
}
}
}
template <typename E>
void ObjectFile<E>::register_section_pieces(Context<E> &ctx) {
for (std::unique_ptr<MergeableSection<E>> &m : mergeable_sections)
if (m)
for (i64 i = 0; i < m->strings.size(); i++)
m->fragments.push_back(m->parent->insert(m->strings[i], m->hashes[i],
m->shdr.sh_addralign));
// Initialize rel_fragments
for (std::unique_ptr<InputSection<E>> &isec : sections) {
@ -663,13 +673,10 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
const ElfSym<E> &esym = elf_syms[rel.r_sym];
if (esym.st_type == STT_SECTION) {
MergeableSection<E> &m = mergeable_sections[get_shndx(esym)];
if (!m.fragments.empty())
len++;
}
if (esym.st_type == STT_SECTION && mergeable_sections[get_shndx(esym)])
len++;
}
if (len == 0)
continue;
@ -683,19 +690,20 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
if (esym.st_type != STT_SECTION)
continue;
MergeableSection<E> &m = mergeable_sections[get_shndx(esym)];
if (m.fragments.empty())
std::unique_ptr<MergeableSection<E>> &m =
mergeable_sections[get_shndx(esym)];
if (!m)
continue;
i64 offset = esym.st_value + isec->get_addend(rel);
std::span<u32> offsets = m.frag_offsets;
std::span<u32> offsets = m->frag_offsets;
auto it = std::upper_bound(offsets.begin(), offsets.end(), offset);
if (it == offsets.begin())
Fatal(ctx) << *this << ": bad relocation at " << rel.r_sym;
i64 idx = it - 1 - offsets.begin();
isec->rel_fragments[frag_idx++] = {m.fragments[idx], (i32)i,
isec->rel_fragments[frag_idx++] = {m->fragments[idx], (i32)i,
(i32)(offset - offsets[idx])};
}
@ -708,11 +716,12 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
if (esym.is_abs() || esym.is_common())
continue;
MergeableSection<E> &m = mergeable_sections[get_shndx(esym)];
if (m.fragments.empty())
std::unique_ptr<MergeableSection<E>> &m =
mergeable_sections[get_shndx(esym)];
if (!m)
continue;
std::span<u32> offsets = m.frag_offsets;
std::span<u32> offsets = m->frag_offsets;
auto it = std::upper_bound(offsets.begin(), offsets.end(), esym.st_value);
if (it == offsets.begin())
@ -722,12 +731,13 @@ void ObjectFile<E>::initialize_mergeable_sections(Context<E> &ctx) {
if (i < first_global)
this->symbols[i]->value = esym.st_value - offsets[idx];
sym_fragments[i].frag = m.fragments[idx];
sym_fragments[i].frag = m->fragments[idx];
sym_fragments[i].addend = esym.st_value - offsets[idx];
}
for (MergeableSection<E> &m : mergeable_sections)
fragments.insert(fragments.end(), m.fragments.begin(), m.fragments.end());
for (std::unique_ptr<MergeableSection<E>> &m : mergeable_sections)
if (m)
fragments.insert(fragments.end(), m->fragments.begin(), m->fragments.end());
}
template <typename E>

output_chunks.cc

@ -1115,6 +1115,14 @@ void GnuHashSection<E>::copy_buf(Context<E> &ctx) {
}
}
template <typename E>
MergedSection<E>::MergedSection(std::string_view name, u64 flags, u32 type)
: OutputChunk<E>(this->SYNTHETIC) {
this->name = name;
this->shdr.sh_flags = flags;
this->shdr.sh_type = type;
}
template <typename E>
MergedSection<E> *
MergedSection<E>::get_instance(Context<E> &ctx, std::string_view name,
@ -1150,78 +1158,63 @@ MergedSection<E>::get_instance(Context<E> &ctx, std::string_view name,
template <typename E>
SectionFragment<E> *
MergedSection<E>::insert(std::string_view data, i64 alignment) {
MergedSection<E>::insert(std::string_view data, u64 hash, i64 alignment) {
ASSERT(alignment < UINT16_MAX);
std::string_view suffix = data;
if (suffix.size() > 32)
suffix = suffix.substr(suffix.size() - 32);
i64 shard = hash_string(suffix) % NUM_SHARDS;
std::call_once(once_flag, [&]() {
// We aim at a 2/3 occupancy ratio
map.resize(estimator.get_cardinality() * 3 / 2);
});
SectionFragment<E> *frag;
{
auto [it, inserted] =
maps[shard].insert(std::pair(data, SectionFragment(this, data)));
frag = &it->second;
}
bool inserted;
std::tie(frag, inserted) = map.insert(data, hash, SectionFragment(this, data));
ASSERT(frag);
for (u16 cur = frag->alignment; cur < alignment;)
if (frag->alignment.compare_exchange_strong(cur, alignment))
break;
max_alignments.local() = std::max(max_alignments.local(), alignment);
return frag;
}
template <typename E>
void MergedSection<E>::assign_offsets() {
std::vector<SectionFragment<E> *> fragments[NUM_SHARDS];
i64 sizes[NUM_SHARDS] = {};
void MergedSection<E>::assign_offsets(Context<E> &ctx) {
std::vector<SectionFragment<E> *> fragments(map.nbuckets);
for (i64 i = 0; i < map.nbuckets; i++)
fragments[i] = map.values + i;
tbb::parallel_for((i64)0, NUM_SHARDS, [&](i64 i) {
for (auto it = maps[i].begin(); it != maps[i].end(); it++)
if (SectionFragment<E> &frag = it->second; frag.is_alive)
fragments[i].push_back(&frag);
// Sort section fragments to make an output deterministic.
std::sort(fragments[i].begin(), fragments[i].end(),
[&](SectionFragment<E> *a, SectionFragment<E> *b) {
if (a->alignment != b->alignment)
return a->alignment > b->alignment;
if (a->data.size() != b->data.size())
return a->data.size() < b->data.size();
return a->data < b->data;
});
i64 offset = 0;
for (SectionFragment<E> *frag : fragments[i]) {
offset = align_to(offset, frag->alignment);
frag->offset = offset;
offset += frag->data.size();
}
sizes[i] = offset;
// Sort fragments to make output deterministic.
tbb::parallel_sort(fragments.begin(), fragments.end(),
[](SectionFragment<E> *a, SectionFragment<E> *b) {
if (!a->is_alive || !b->is_alive)
return a->is_alive && !b->is_alive;
if (a->alignment != b->alignment)
return a->alignment < b->alignment;
if (a->data.size() != b->data.size())
return a->data.size() < b->data.size();
return a->data < b->data;
});
i64 alignment = 1;
for (i64 x : max_alignments)
alignment = std::max(alignment, x);
for (i64 i = 1; i < NUM_SHARDS + 1; i++)
shard_offsets[i] =
align_to(shard_offsets[i - 1] + sizes[i - 1], alignment);
tbb::parallel_for((i64)1, NUM_SHARDS, [&](i64 i) {
for (SectionFragment<E> *frag : fragments[i])
frag->offset += shard_offsets[i];
// Remove dead fragments.
auto mid = std::partition_point(fragments.begin(), fragments.end(),
[](SectionFragment<E> *frag) -> bool {
return frag->is_alive;
});
fragments.resize(mid - fragments.begin());
this->shdr.sh_size = shard_offsets[NUM_SHARDS];
this->shdr.sh_addralign = alignment;
// Assign offsets.
i64 offset = 0;
for (SectionFragment<E> *frag : fragments) {
offset = align_to(offset, frag->alignment);
frag->offset = offset;
offset += frag->data.size();
this->shdr.sh_addralign =
std::max<i64>(this->shdr.sh_addralign, frag->alignment);
}
this->shdr.sh_size = offset;
static Counter merged_strings("merged_strings");
for (std::span<SectionFragment<E> *> span : fragments)
merged_strings += span.size();
merged_strings += fragments.size();
}
template <typename E>
@ -1231,11 +1224,12 @@ void MergedSection<E>::copy_buf(Context<E> &ctx) {
template <typename E>
void MergedSection<E>::write_to(Context<E> &ctx, u8 *buf) {
tbb::parallel_for((i64)0, NUM_SHARDS, [&](i64 i) {
memset(buf + shard_offsets[i], 0, shard_offsets[i + 1] - shard_offsets[i]);
for (auto it = maps[i].begin(); it != maps[i].end(); it++)
if (SectionFragment<E> &frag = it->second; frag.is_alive)
memcpy(buf + frag.offset, frag.data.data(), frag.data.size());
memset(buf, 0, this->shdr.sh_size);
tbb::parallel_for_each(map.values, map.values + map.nbuckets,
[&](SectionFragment<E> &frag) {
if (frag.is_alive)
memcpy(buf + frag.offset, frag.data.data(), frag.data.size());
});
}

passes.cc

@ -198,7 +198,8 @@ void add_comment_string(Context<E> &ctx, std::string str) {
std::string_view buf = save_string(ctx, str);
MergedSection<E> *sec =
MergedSection<E>::get_instance(ctx, ".comment", SHT_PROGBITS, 0);
SectionFragment<E> *frag = sec->insert({buf.data(), buf.size() + 1}, 1);
std::string_view data(buf.data(), buf.size() + 1);
SectionFragment<E> *frag = sec->insert(data, hash_string(data), 1);
frag->is_alive = true;
}
@ -221,9 +222,10 @@ void compute_merged_section_sizes(Context<E> &ctx) {
if (char *env = getenv("MOLD_DEBUG"); env && env[0])
add_comment_string(ctx, "mold command line: " + get_cmdline_args(ctx));
Timer t2(ctx, "MergedSection assign_offsets");
tbb::parallel_for_each(ctx.merged_sections,
[](std::unique_ptr<MergedSection<E>> &sec) {
sec->assign_offsets();
[&](std::unique_ptr<MergedSection<E>> &sec) {
sec->assign_offsets(ctx);
});
}