From 54399fe567a0ab1c0ed405aefc8e601ce96ab740 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Tue, 26 Jul 2022 16:14:49 +0800 Subject: [PATCH] [Mach-O] Support LOH_ARM64_ADRP_LDR_GOT_LDR --- macho/arch-arm64.cc | 99 +++++++++++++++++++++++++ macho/cmdline.cc | 3 + macho/input-files.cc | 13 ++++ macho/macho.h | 9 +++ macho/main.cc | 4 + macho/mold.h | 5 +- test/macho/linker-optimization-hints.sh | 39 ++++++++++ 7 files changed, 171 insertions(+), 1 deletion(-) create mode 100755 test/macho/linker-optimization-hints.sh diff --git a/macho/arch-arm64.cc b/macho/arch-arm64.cc index 4af71874..3b60ebe9 100644 --- a/macho/arch-arm64.cc +++ b/macho/arch-arm64.cc @@ -467,4 +467,103 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { } } +#define ASSERT_RANGE(val, start, size) \ + assert((start) <= (val) && (val) < ((start) + (size))) + +// On ARM64, we need two or more instructions to materialize an address +// of an object in a register or jump to a function within PC ± 2GiB. +// However, if an object or a function is close enough to PC, a single +// instruction is sufficient to materialize its address. +// +// This function replaces such a redundant sequence of two or more +// instructions with a single instruction. We don't shrink a section, so +// the total number of instructions won't change by this relaxation, +// but replacing an instruction with a NOP generally increases +// performance since the CPU has special logic to skip a NOP instead of +// executing it. +// +// Locations of relaxable instructions are in the data referenced by +// the LC_LINKER_OPTIMIZATION_HINT load command. That data contains a +// sequence of ULEB-encoded integers. 
+void apply_linker_optimization_hints(Context &ctx) {
+  Timer t(ctx, "apply_linker_optimization_hints");
+
+  // Returns true if `val` is in the half-open range [start, start + size).
+  auto in_range = [](u64 val, u64 start, u64 size) {
+    return start <= val && val < start + size;
+  };
+
+  tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) {
+    u8 *hints = file->get_linker_optimization_hints(ctx);
+    if (!hints)
+      return;
+
+    // Each hint record is a hint type, an argument count and then that
+    // many arguments, all ULEB-encoded. A zero type ends the list.
+    for (;;) {
+      i64 type = read_uleb(hints);
+      if (type == 0)
+        return;
+
+      i64 nargs = read_uleb(hints);
+
+      // Skip hint types we don't handle. An ADRP_LDR_GOT_LDR record with
+      // an argument count other than three is skipped the same way, so
+      // that exactly `nargs` arguments are always consumed and we stay
+      // in sync with the stream.
+      if (type != LOH_ARM64_ADRP_LDR_GOT_LDR || nargs != 3) {
+        for (i64 i = 0; i < nargs; i++)
+          read_uleb(hints);
+        continue;
+      }
+
+      // The arguments are the input addresses of the three instructions.
+      i64 addr1 = read_uleb(hints);
+      i64 addr2 = read_uleb(hints);
+      i64 addr3 = read_uleb(hints);
+
+      Subsection *subsec = file->find_subsection(ctx, addr1);
+      if (!subsec || !subsec->is_alive)
+        continue;
+
+      // All three instructions must be in the subsection found for the
+      // first one because they are located relative to it below. These
+      // are runtime checks rather than asserts so that an unexpected
+      // hint is ignored even when asserts are compiled out.
+      if (!in_range(addr2, subsec->input_addr, subsec->input_size) ||
+          !in_range(addr3, subsec->input_addr, subsec->input_size))
+        continue;
+
+      u8 *loc = ctx.buf + subsec->isec.osec.hdr.offset + subsec->output_offset;
+
+      i64 offset1 = addr1 - subsec->input_addr;
+      i64 offset2 = addr2 - subsec->input_addr;
+      i64 offset3 = addr3 - subsec->input_addr;
+
+      ul32 *loc1 = (ul32 *)(loc + offset1);
+      ul32 *loc2 = (ul32 *)(loc + offset2);
+      ul32 *loc3 = (ul32 *)(loc + offset3);
+
+      // Take copies of the instructions. They are overwritten below, and
+      // fields of the last one are still needed after that.
+      u32 insn1 = *loc1;
+      u32 insn2 = *loc2;
+      u32 insn3 = *loc3;
+
+      // We expect the following instructions:
+      //
+      //   adrp reg1, _foo@GOTPAGE
+      //   ldr  reg2, [reg1, _foo@GOTPAGEOFF]
+      //   ldr  reg3, [reg2]
+      //
+      // The first LDR has to be a 64-bit load since its offset is scaled
+      // by 8 and the GOT slot is read as a 64-bit value below. The second
+      // LDR may be a 32-bit or a 64-bit load, but its offset has to be
+      // zero because the replacement loads from _foo itself.
+      if ((insn1 & 0x9f00'0000) != 0x9000'0000 ||
+          (insn2 & 0xffc0'0000) != 0xf940'0000 ||
+          (insn3 & 0xbfff'fc00) != 0xb940'0000)
+        continue;
+
+      // reg1 and reg2 must actually link the three instructions.
+      if (bits(insn1, 4, 0) != bits(insn2, 9, 5) ||
+          bits(insn2, 4, 0) != bits(insn3, 9, 5))
+        continue;
+
+      // ADRP's immediate is a signed 21-bit page count whose low two
+      // bits are in bits 30:29 and whose upper bits are in bits 23:5.
+      i64 adrp_imm =
+        sign_extend((bits(insn1, 23, 5) << 2) | bits(insn1, 30, 29), 20);
+
+      u64 got_addr = page(subsec->get_addr(ctx) + offset1) +
+                     adrp_imm * 4096 + (bits(insn2, 21, 10) << 3);
+
+      if (!in_range(got_addr, ctx.got.hdr.addr, ctx.got.hdr.size))
+        continue;
+
+      u64 got_value = *(ul64 *)(ctx.buf + ctx.got.hdr.offset + got_addr -
+                                ctx.got.hdr.addr);
+
+      // A zero slot is taken to mean that the GOT entry has not been
+      // filled, so there is no address to load from directly.
+      if (!got_value)
+        continue;
+
+      // LDR (literal) is relative to the address of the instruction
+      // itself, and it replaces the third instruction. Its displacement
+      // is a signed 21-bit value whose low two bits are implicitly zero.
+      i64 disp = got_value - subsec->get_addr(ctx) - offset3;
+      if ((disp & 3) || disp != sign_extend(disp, 20))
+        continue;
+
+      // The GOT entry has already been filled and its value is within
+      // the range of LDR, so we can convert to
+      //
+      //   nop
+      //   nop
+      //   ldr reg3, _foo
+      //
+      // Bit 30 selects a 32-bit or a 64-bit load in both the original and
+      // the literal form of LDR, so it is carried over along with reg3.
+      *loc1 = 0xd503'201f;
+      *loc2 = 0xd503'201f;
+      *loc3 = 0x1800'0000 | (insn3 & 0x4000'0000) |
+              (bits(disp, 20, 2) << 5) | (insn3 & 0x0000'001f);
+    }
+  });
+}
+
 } // namespace mold::macho
diff --git a/macho/cmdline.cc b/macho/cmdline.cc index 9db14088..fd6bbca3 100644 --- a/macho/cmdline.cc +++ b/macho/cmdline.cc @@ -63,6 +63,7 @@ Options: Allocate MAXPATHLEN byte padding after load commands -help Report usage information -hidden-l + -ignore_optimization_hints Do not rewrite instructions as optimization -install_name -l Search for a given library -lto_library Ignored @@ -387,6 +388,8 @@ std::vector parse_nonpositional_args(Context &ctx) { } else if (read_joined("-hidden-l")) { remaining.push_back("-hidden-l"); remaining.push_back(std::string(arg)); + } else if (read_flag("-ignore_optimization_hints")) { + ctx.arg.ignore_optimization_hints = true; } else if (read_arg("-install_name") || read_arg("-dylib_install_name")) { ctx.arg.install_name = arg; } else if (read_joined("-l")) { diff --git a/macho/input-files.cc b/macho/input-files.cc index 1b3cf570..224364b2 100644 --- a/macho/input-files.cc +++ b/macho/input-files.cc @@ -398,6 +398,9 @@ std::vector ObjectFile::get_linker_options(Context &ctx) { template LoadCommand *ObjectFile::find_load_command(Context &ctx, u32 type) { + if (!this->mf) + return nullptr; + MachHeader &hdr = *(MachHeader *)this->mf->data; u8 *p = this->mf->data + sizeof(hdr); @@ -778,6 +781,16 @@ void ObjectFile::parse_lto_symbols(Context &ctx) { mach_syms = mach_syms2; } +template +u8 *ObjectFile::get_linker_optimization_hints(Context &ctx) { + LinkEditDataCommand *cmd = + (LinkEditDataCommand *)find_load_command(ctx, LC_LINKER_OPTIMIZATION_HINT); + + if (cmd) + return this->mf->data + cmd->dataoff; + return nullptr; +} + template DylibFile::DylibFile(Context &ctx, MappedFile> *mf) : InputFile(mf) { diff --git a/macho/macho.h b/macho/macho.h index 363715ae..5514d4c5 100644 --- a/macho/macho.h +++ b/macho/macho.h @@ -326,6 
+326,15 @@ static constexpr u32 OBJC_IMAGE_SUPPORTS_COMPACTION = 1 << 4; static constexpr u32 OBJC_IMAGE_IS_SIMULATED = 1 << 5; static constexpr u32 OBJC_IMAGE_HAS_CATEGORY_CLASS_PROPERTIES = 1 << 6; +// Linker optimization hint (LOH) kinds. The type field of each hint +// record is compared against these values by +// apply_linker_optimization_hints() in arch-arm64.cc, which currently +// acts only on LOH_ARM64_ADRP_LDR_GOT_LDR and skips every other kind. +static constexpr u32 LOH_ARM64_ADRP_ADRP = 1; +static constexpr u32 LOH_ARM64_ADRP_LDR = 2; +static constexpr u32 LOH_ARM64_ADRP_ADD_LDR = 3; +static constexpr u32 LOH_ARM64_ADRP_LDR_GOT_LDR = 4; +static constexpr u32 LOH_ARM64_ADRP_ADD_STR = 5; +static constexpr u32 LOH_ARM64_ADRP_LDR_GOT_STR = 6; +static constexpr u32 LOH_ARM64_ADRP_ADD = 7; +static constexpr u32 LOH_ARM64_ADRP_LDR_GOT = 8; + static constexpr u32 ARM64_RELOC_UNSIGNED = 0; static constexpr u32 ARM64_RELOC_SUBTRACTOR = 1; static constexpr u32 ARM64_RELOC_BRANCH26 = 2; diff --git a/macho/main.cc b/macho/main.cc index 2cebd098..a2225166 100644 --- a/macho/main.cc +++ b/macho/main.cc @@ -1047,6 +1047,10 @@ static int do_main(int argc, char **argv) { copy_sections_to_output_file(ctx); + if constexpr (std::is_same_v) + if (!ctx.arg.ignore_optimization_hints) + apply_linker_optimization_hints(ctx); + if (ctx.code_sig) ctx.code_sig->write_signature(ctx); else if (ctx.arg.uuid == UUID_HASH) diff --git a/macho/mold.h b/macho/mold.h index 0ea93912..2200f1be 100644 --- a/macho/mold.h +++ b/macho/mold.h @@ -124,6 +124,7 @@ public: std::function *)> feeder); void convert_common_symbols(Context &ctx); void check_duplicate_symbols(Context &ctx); + u8 *get_linker_optimization_hints(Context &ctx); Relocation read_reloc(Context &ctx, const MachSection &hdr, MachRel r); @@ -807,6 +808,7 @@ void do_lto(Context &ctx); // void create_range_extension_thunks(Context &ctx, OutputSection &osec); +void apply_linker_optimization_hints(Context &ctx); // // main.cc @@ -874,13 +876,14 @@ struct Context { bool dynamic = true; bool export_dynamic = false; bool fatal_warnings = false; + bool ignore_optimization_hints = false; + bool mark_dead_strippable_dylib = false; bool noinhibit_exec = false; bool perf = false; bool quick_exit = 
true; bool search_paths_first = true; bool stats = false; bool trace = false; - bool mark_dead_strippable_dylib = false; i64 arch = CPU_TYPE_ARM64; i64 compatibility_version = 0; i64 current_version = 0; diff --git a/test/macho/linker-optimization-hints.sh b/test/macho/linker-optimization-hints.sh new file mode 100755 index 00000000..21409aee --- /dev/null +++ b/test/macho/linker-optimization-hints.sh @@ -0,0 +1,39 @@ +#!/bin/bash +export LC_ALL=C +set -e +CC="${TEST_CC:-cc}" +CXX="${TEST_CXX:-c++}" +GCC="${TEST_GCC:-gcc}" +GXX="${TEST_GXX:-g++}" +OBJDUMP="${OBJDUMP:-objdump}" +MACHINE="${MACHINE:-$(uname -m)}" +testname=$(basename "$0" .sh) +echo -n "Testing $testname ... " +t=out/test/macho/$MACHINE/$testname +mkdir -p $t + +cat < + +int foo = 0; + +void hello() { + printf("Hello world\n"); +} +EOF + +cat <