[ELF] Factor out DWARF-related code to dwarf.cc

2024-10-04 16:48:04 +03:00 · 2022-04-19 12:28:37 +08:00 · 2022-04-19 12:28:37 +08:00 · a4398c7c32
commit a4398c7c32
parent 7ebd071273
4 changed files with 301 additions and 342 deletions
--- a/elf/dwarf.cc
+++ b/elf/dwarf.cc
@ -0,0 +1,286 @@
+#include "mold.h"
+
+namespace mold::elf {
+
+// The hash function for .gdb_index.
+static u32 gdb_hash(std::string_view name) {
+  u32 h = 0;
+  for (u8 c : name) {
+    if ('A' <= c && c <= 'Z')
+      c = 'a' + c - 'A';
+    h = h * 67 + c - 113;
+  }
+  return h;
+}
+
+// Split .debug_info into so-called "compilation units". A .debug_info
+// section usually contains one compunit unless it was created by `ld -r`.
+// This is for --gdb-index.
+template <typename E>
+std::vector<std::string_view>
+read_compunits(Context<E> &ctx, ObjectFile<E> &file) {
+  file.debug_info->uncompress(ctx);
+  std::string_view data = file.debug_info->contents;
+  std::vector<std::string_view> vec;
+
+  while (!data.empty()) {
+    if (data.size() < 4)
+      Fatal(ctx) << *file.debug_info << ": corrupted .debug_info";
+    i64 len = *(u32 *)data.data() + 4;
+    vec.push_back(data.substr(0, len));
+    data = data.substr(len);
+  }
+  return vec;
+}
+
+// Parses .debug_gnu_pubnames and .debug_gnu_pubtypes. These sections
+// start with a 14 bytes header followed by (4-byte offset, 1-byte type,
+// null-terminated string) tuples.
+//
+// The 4-byte offset is an offset into .debug_info that contains details
+// about the name. The 1-byte type is a type of the corresponding name
+// (e.g. function, variable or datatype). The string is a name of a
+// function, a variable or a type.
+template <typename E>
+std::vector<GdbIndexName> read_pubnames(Context<E> &ctx, ObjectFile<E> &file) {
+  std::vector<GdbIndexName> vec;
+
+  auto get_cu_idx = [&](InputSection<E> &isec, i64 offset) {
+    i64 off = 0;
+    for (i64 i = 0; i < file.compunits.size(); i++) {
+      if (offset == off)
+        return file.compunits_idx + i;
+      off += file.compunits[i].size();
+    }
+    Fatal(ctx) << isec << ": corrupted debug_info_offset";
+  };
+
+  auto read = [&](InputSection<E> &isec) {
+    isec.uncompress(ctx);
+    std::string_view contents = isec.contents;
+
+    while (!contents.empty()) {
+      if (contents.size() < 14)
+        Fatal(ctx) << isec << ": corrupted header";
+
+      u32 len = *(u32 *)contents.data() + 4;
+      u32 debug_info_offset = *(u32 *)(contents.data() + 6);
+      u32 cu_idx = get_cu_idx(isec, debug_info_offset);
+
+      std::string_view data = contents.substr(14, len - 14);
+      contents = contents.substr(len);
+
+      while (!data.empty()) {
+        u32 offset = *(u32 *)data.data();
+        data = data.substr(4);
+        if (offset == 0)
+          break;
+
+        u8 type = data[0];
+        data = data.substr(1);
+
+        std::string_view name = data.data();
+        data = data.substr(name.size() + 1);
+
+        vec.push_back({name, gdb_hash(name), offset + debug_info_offset,
+                       (type << 24) | cu_idx});
+      }
+    }
+  };
+
+  if (file.debug_pubnames)
+    read(*file.debug_pubnames);
+  if (file.debug_pubtypes)
+    read(*file.debug_pubtypes);
+  return vec;
+}
+
+// Try to find a compilation unit from .debug_info and its
+// corresponding record from .debug_abbrev and returns them.
+template <typename E>
+static std::pair<u8 *, u8 *>
+find_compunit(Context<E> &ctx, ObjectFile<E> &file, i64 offset) {
+  // Read .debug_info to find the record at a given offset.
+  u8 *cu = (u8 *)(ctx.buf + ctx.debug_info->shdr.sh_offset + offset);
+  u32 dwarf_version = *(u16 *)(cu + 4);
+  u32 abbrev_offset;
+
+  switch (dwarf_version) {
+  case 4:
+    abbrev_offset = *(u32 *)(cu + 6);
+    cu += 11;
+    break;
+  case 5:
+    abbrev_offset = *(u32 *)(cu + 8);
+    cu += 12;
+    break;
+  default:
+    Fatal(ctx) << file << ": --gdb-index: unknown DWARF version "
+               << dwarf_version;
+  }
+
+  u32 abbrev_code = read_uleb(cu);
+
+  // Find a .debug_abbrev record corresponding to the .debug_info record.
+  // We assume the .debug_info record at a given offset is of
+  // DW_TAG_compile_unit which describes a compunit.
+  u8 *abbrev = (u8 *)(ctx.buf + ctx.debug_abbrev->shdr.sh_offset + abbrev_offset);
+
+  for (;;) {
+    u32 code = read_uleb(abbrev);
+    if (code == 0) {
+      Fatal(ctx) << file << ": --gdb-index: .debug_abbrev does not contain"
+                 << " a record for the first .debug_info record";
+      return {};
+    }
+
+    if (code == abbrev_code) {
+      // Found a record
+      u64 abbrev_tag = read_uleb(abbrev);
+      if (abbrev_tag != DW_TAG_compile_unit) {
+        Fatal(ctx) << file << ": --gdb-index: the first entry's tag is not "
+                   << " DW_TAG_compile_unit but 0x" << std::hex << abbrev_tag;
+        return {};
+      }
+      break;
+    }
+
+    // Skip an uninteresting record
+    for (;;) {
+      u64 name = read_uleb(abbrev);
+      u64 form = read_uleb(abbrev);
+      if (name == 0 && form == 0)
+        break;
+    }
+  }
+
+  abbrev++; // skip has_children byte
+  return {cu, abbrev};
+}
+
+// Returns a list of address ranges explained by a compunit at the
+// `offset` in an output .debug_info section.
+//
+// .debug_info contains DWARF debug info records, so this function
+// parses DWARF. If a designated compunit contains multiple ranges, the
+// ranges are read from .debug_ranges. Otherwise, a range is read
+// directly from .debug_info.
+template <typename E>
+std::vector<u64>
+read_address_areas(Context<E> &ctx, ObjectFile<E> &file, i64 offset) {
+  u8 *cu;
+  u8 *abbrev;
+  std::tie(cu, abbrev) = find_compunit(ctx, file, offset);
+
+  std::optional<u64> low_pc;
+
+  for (;;) {
+    u64 name = read_uleb(abbrev);
+    u64 form = read_uleb(abbrev);
+    if (name == 0 && form == 0)
+      break;
+
+    auto read_value = [&]() -> u64 {
+      switch (form) {
+      case DW_FORM_flag_present:
+        return 0;
+      case DW_FORM_data1:
+      case DW_FORM_flag:
+      case DW_FORM_strx1:
+      case DW_FORM_addrx1:
+      case DW_FORM_ref1:
+        return *cu++;
+      case DW_FORM_data2:
+      case DW_FORM_strx2:
+      case DW_FORM_addrx2:
+      case DW_FORM_ref2: {
+        u64 val = *(u16 *)cu;
+        cu += 2;
+        return val;
+      }
+      case DW_FORM_data4:
+      case DW_FORM_strp:
+      case DW_FORM_sec_offset:
+      case DW_FORM_line_strp:
+      case DW_FORM_strx4:
+      case DW_FORM_addrx4:
+      case DW_FORM_ref4: {
+        u64 val = *(u32 *)cu;
+        cu += 4;
+        return val;
+      }
+      case DW_FORM_data8:
+      case DW_FORM_ref8: {
+        u64 val = *(u64 *)cu;
+        cu += 8;
+        return val;
+      }
+      case DW_FORM_addr:
+      case DW_FORM_ref_addr: {
+        u64 val = *(typename E::WordTy *)cu;
+        cu += E::word_size;
+        return val;
+      }
+      case DW_FORM_strx:
+      case DW_FORM_addrx:
+      case DW_FORM_ref_udata:
+        return read_uleb(cu);
+      case DW_FORM_string: {
+        while (*cu)
+          cu++;
+        cu++;
+        return 0;
+      }
+      default:
+        Fatal(ctx) << file << ": --gdb-index: unknown debug info form: 0x"
+                   << std::hex << form;
+        return 0;
+      }
+    };
+
+    switch (name) {
+    case DW_AT_low_pc:
+      *low_pc = read_value();
+      break;
+    case DW_AT_high_pc:
+      if (low_pc)
+        Fatal(ctx) << file << ": --gdb-index: missing DW_AT_low_pc";
+
+      if (form == DW_FORM_addr)
+        return {*low_pc, read_value()};
+      return {*low_pc, *low_pc + read_value()};
+    case DW_AT_ranges: {
+      if (!ctx.debug_ranges)
+        Fatal(ctx) << file << ": --gdb-index: missing debug_ranges";
+
+      u64 offset = read_value();
+      typename E::WordTy *range =
+        (typename E::WordTy *)(ctx.buf + ctx.debug_ranges->shdr.sh_offset + offset);
+
+      std::vector<u64> vec;
+      for (i64 i = 0; range[i] || range[i + 1]; i += 2) {
+        vec.push_back(range[i]);
+        vec.push_back(range[i + 1]);
+      }
+      return vec;
+    }
+    default:
+      read_value();
+      break;
+    }
+  }
+
+  return {};
+}
+
+#define INSTANTIATE(E)                                                  \
+  template std::vector<std::string_view>                                \
+  read_compunits(Context<E> &, ObjectFile<E> &);                        \
+  template std::vector<GdbIndexName>                                    \
+  read_pubnames(Context<E> &, ObjectFile<E> &);                         \
+  template std::vector<u64>                                             \
+  read_address_areas(Context<E> &, ObjectFile<E> &, i64)
+
+INSTANTIATE_ALL;
+
+} // namespace mold::elf
--- a/elf/mold.h
+++ b/elf/mold.h
@ -940,17 +940,6 @@ private:
  i64 num_symtab_entries = 0;
  i64 attrs_size = 0;

-  std::vector<std::string_view>
-  read_compunits(Context<E> &ctx, ObjectFile<E> &file);
-
-  std::vector<GdbIndexName> read_pubnames(Context<E> &ctx, ObjectFile<E> &file);
-
-  std::pair<u8 *, u8 *> find_compunit(Context<E> &ctx, ObjectFile<E> &file,
-                                      i64 offset);
-
-  std::vector<u64> read_address_areas(Context<E> &ctx, ObjectFile<E> &file,
-                                      i64 offset);
-
  ConcurrentMap<MapEntry> map;
 };

@ -994,6 +983,21 @@ private:

 bool is_c_identifier(std::string_view name);

+//
+// dwarf.cc
+//
+
+template <typename E>
+std::vector<std::string_view>
+read_compunits(Context<E> &ctx, ObjectFile<E> &file);
+
+template <typename E>
+std::vector<GdbIndexName> read_pubnames(Context<E> &ctx, ObjectFile<E> &file);
+
+template <typename E>
+std::vector<u64>
+read_address_areas(Context<E> &ctx, ObjectFile<E> &file, i64 offset);
+
 //
 // input-files.cc
 //
--- a/elf/output-chunks.cc
+++ b/elf/output-chunks.cc
@ -31,17 +31,6 @@ static u32 djb_hash(std::string_view name) {
  return h;
 }

-// The hash function for .gdb_index.
-static u32 gdb_hash(std::string_view name) {
-  u32 h = 0;
-  for (u8 c : name) {
-    if ('A' <= c && c <= 'Z')
-      c = 'a' + c - 'A';
-    h = h * 67 + c - 113;
-  }
-  return h;
-}
-
 template <typename E>
 void Chunk<E>::write_to(Context<E> &ctx, u8 *buf) {
  Fatal(ctx) << name << ": write_to is called on an invalid section";
@ -2413,269 +2402,6 @@ void GdbIndexSection<E>::write_address_areas(Context<E> &ctx) {
  });
 }

-// Returns the list of compilation units in .gdb_index. A .gdb_index
-// usually contains only one compilatation unit unless the object was
-// built by `ld -r`.
-template <typename E>
-std::vector<std::string_view>
-GdbIndexSection<E>::read_compunits(Context<E> &ctx, ObjectFile<E> &file) {
-  file.debug_info->uncompress(ctx);
-  std::string_view data = file.debug_info->contents;
-  std::vector<std::string_view> vec;
-
-  while (!data.empty()) {
-    if (data.size() < 4)
-      Fatal(ctx) << *file.debug_info << ": corrupted .debug_info";
-    i64 len = *(u32 *)data.data() + 4;
-    vec.push_back(data.substr(0, len));
-    data = data.substr(len);
-  }
-  return vec;
-}
-
-// Parses .debug_gnu_pubnames and .debug_gnu_pubtypes. These sections
-// start with a 14 bytes header followed by (4-byte offset, 1-byte type,
-// null-terminated string) tuples.
-//
-// The 4-byte offset is an offset into .debug_info that contains details
-// about the name. The 1-byte type is a type of the corresponding name
-// (e.g. function, variable or datatype). The string is a name of a
-// function, a variable or a type.
-template <typename E>
-std::vector<GdbIndexName>
-GdbIndexSection<E>::read_pubnames(Context<E> &ctx, ObjectFile<E> &file) {
-  std::vector<GdbIndexName> vec;
-
-  auto get_cu_idx = [&](InputSection<E> &isec, i64 offset) {
-    i64 off = 0;
-    for (i64 i = 0; i < file.compunits.size(); i++) {
-      if (offset == off)
-        return file.compunits_idx + i;
-      off += file.compunits[i].size();
-    }
-    Fatal(ctx) << isec << ": corrupted debug_info_offset";
-  };
-
-  auto read = [&](InputSection<E> &isec) {
-    isec.uncompress(ctx);
-    std::string_view contents = isec.contents;
-
-    while (!contents.empty()) {
-      if (contents.size() < 14)
-        Fatal(ctx) << isec << ": corrupted header";
-
-      u32 len = *(u32 *)contents.data() + 4;
-      u32 debug_info_offset = *(u32 *)(contents.data() + 6);
-      u32 cu_idx = get_cu_idx(isec, debug_info_offset);
-
-      std::string_view data = contents.substr(14, len - 14);
-      contents = contents.substr(len);
-
-      while (!data.empty()) {
-        u32 offset = *(u32 *)data.data();
-        data = data.substr(4);
-        if (offset == 0)
-          break;
-
-        u8 type = data[0];
-        data = data.substr(1);
-
-        std::string_view name = data.data();
-        data = data.substr(name.size() + 1);
-
-        vec.push_back({name, gdb_hash(name), offset + debug_info_offset,
-                       (type << 24) | cu_idx});
-      }
-    }
-  };
-
-  if (file.debug_pubnames)
-    read(*file.debug_pubnames);
-  if (file.debug_pubtypes)
-    read(*file.debug_pubtypes);
-  return vec;
-}
-
-// Try to find a compilation unit from .debug_info and its
-// corresponding record from .debug_abbrev and returns them.
-template <typename E>
-std::pair<u8 *, u8 *>
-GdbIndexSection<E>::find_compunit(Context<E> &ctx, ObjectFile<E> &file,
-                                  i64 offset) {
-  // Read .debug_info to find the record at a given offset.
-  u8 *cu = (u8 *)(ctx.buf + ctx.debug_info->shdr.sh_offset + offset);
-  u32 dwarf_version = *(u16 *)(cu + 4);
-  u32 abbrev_offset;
-
-  switch (dwarf_version) {
-  case 4:
-    abbrev_offset = *(u32 *)(cu + 6);
-    cu += 11;
-    break;
-  case 5:
-    abbrev_offset = *(u32 *)(cu + 8);
-    cu += 12;
-    break;
-  default:
-    Fatal(ctx) << file << ": --gdb-index: unknown DWARF version "
-               << dwarf_version;
-  }
-
-  u32 abbrev_code = read_uleb(cu);
-
-  // Find a .debug_abbrev record corresponding to the .debug_info record.
-  // We assume the .debug_info record at a given offset is of
-  // DW_TAG_compile_unit which describes a compunit.
-  u8 *abbrev = (u8 *)(ctx.buf + ctx.debug_abbrev->shdr.sh_offset + abbrev_offset);
-
-  for (;;) {
-    u32 code = read_uleb(abbrev);
-    if (code == 0) {
-      Fatal(ctx) << file << ": --gdb-index: .debug_abbrev does not contain"
-                 << " a record for the first .debug_info record";
-      return {};
-    }
-
-    if (code == abbrev_code) {
-      // Found a record
-      u64 abbrev_tag = read_uleb(abbrev);
-      if (abbrev_tag != DW_TAG_compile_unit) {
-        Fatal(ctx) << file << ": --gdb-index: the first entry's tag is not "
-                   << " DW_TAG_compile_unit but 0x" << std::hex << abbrev_tag;
-        return {};
-      }
-      break;
-    }
-
-    // Skip an uninteresting record
-    for (;;) {
-      u64 name = read_uleb(abbrev);
-      u64 form = read_uleb(abbrev);
-      if (name == 0 && form == 0)
-        break;
-    }
-  }
-
-  abbrev++; // skip has_children byte
-  return {cu, abbrev};
-}
-
-// Returns a list of address ranges explained by a compunit at the
-// `offset` in an output .debug_info section.
-//
-// .debug_info contains DWARF debug info records, so this function
-// parses DWARF. If a designated compunit contains multiple ranges, the
-// ranges are read from .debug_ranges. Otherwise, a range is read
-// directly from .debug_info.
-template <typename E>
-std::vector<u64>
-GdbIndexSection<E>::read_address_areas(Context<E> &ctx, ObjectFile<E> &file,
-                                       i64 offset) {
-  u8 *cu;
-  u8 *abbrev;
-  std::tie(cu, abbrev) = find_compunit(ctx, file, offset);
-
-  std::optional<u64> low_pc;
-
-  for (;;) {
-    u64 name = read_uleb(abbrev);
-    u64 form = read_uleb(abbrev);
-    if (name == 0 && form == 0)
-      break;
-
-    auto read_value = [&]() -> u64 {
-      switch (form) {
-      case DW_FORM_flag_present:
-        return 0;
-      case DW_FORM_data1:
-      case DW_FORM_flag:
-      case DW_FORM_strx1:
-      case DW_FORM_addrx1:
-      case DW_FORM_ref1:
-        return *cu++;
-      case DW_FORM_data2:
-      case DW_FORM_strx2:
-      case DW_FORM_addrx2:
-      case DW_FORM_ref2: {
-        u64 val = *(u16 *)cu;
-        cu += 2;
-        return val;
-      }
-      case DW_FORM_data4:
-      case DW_FORM_strp:
-      case DW_FORM_sec_offset:
-      case DW_FORM_line_strp:
-      case DW_FORM_strx4:
-      case DW_FORM_addrx4:
-      case DW_FORM_ref4: {
-        u64 val = *(u32 *)cu;
-        cu += 4;
-        return val;
-      }
-      case DW_FORM_data8:
-      case DW_FORM_ref8: {
-        u64 val = *(u64 *)cu;
-        cu += 8;
-        return val;
-      }
-      case DW_FORM_addr:
-      case DW_FORM_ref_addr: {
-        u64 val = *(typename E::WordTy *)cu;
-        cu += E::word_size;
-        return val;
-      }
-      case DW_FORM_strx:
-      case DW_FORM_addrx:
-      case DW_FORM_ref_udata:
-        return read_uleb(cu);
-      case DW_FORM_string: {
-        while (*cu)
-          cu++;
-        cu++;
-        return 0;
-      }
-      default:
-        Fatal(ctx) << file << ": --gdb-index: unknown debug info form: 0x"
-                   << std::hex << form;
-        return 0;
-      }
-    };
-
-    switch (name) {
-    case DW_AT_low_pc:
-      *low_pc = read_value();
-      break;
-    case DW_AT_high_pc:
-      if (low_pc)
-        Fatal(ctx) << file << ": --gdb-index: missing DW_AT_low_pc";
-
-      if (form == DW_FORM_addr)
-        return {*low_pc, read_value()};
-      return {*low_pc, *low_pc + read_value()};
-    case DW_AT_ranges: {
-      if (!ctx.debug_ranges)
-        Fatal(ctx) << file << ": --gdb-index: missing debug_ranges";
-
-      u64 offset = read_value();
-      typename E::WordTy *range =
-        (typename E::WordTy *)(ctx.buf + ctx.debug_ranges->shdr.sh_offset + offset);
-
-      std::vector<u64> vec;
-      for (i64 i = 0; range[i] || range[i + 1]; i += 2) {
-        vec.push_back(range[i]);
-        vec.push_back(range[i + 1]);
-      }
-      return vec;
-    }
-    default:
-      read_value();
-      break;
-    }
-  }
-
-  return {};
-}
-
 template <typename E>
 GabiCompressedSection<E>::GabiCompressedSection(Context<E> &ctx,
                                                Chunk<E> &chunk) {
--- a/test/elf/gdb-index-dwarf5.sh
+++ b/test/elf/gdb-index-dwarf5.sh
@ -1,57 +0,0 @@
-#!/bin/bash
-export LC_ALL=C
-set -e
-CC="${CC:-cc}"
-CXX="${CXX:-c++}"
-GCC="${GCC:-gcc}"
-GXX="${GXX:-g++}"
-OBJDUMP="${OBJDUMP:-objdump}"
-MACHINE="${MACHINE:-$(uname -m)}"
-testname=$(basename "$0" .sh)
-echo -n "Testing $testname ... "
-cd "$(dirname "$0")"/../..
-mold="$(pwd)/mold"
-t=out/test/elf/$testname
-mkdir -p $t
-
-[ $MACHINE = $(uname -m) ] || { echo skipped; exit; }
-
-which gdb >& /dev/null || { echo skipped; exit; }
-
-echo 'int main() {}' | $CC -gdwarf-5 -o /dev/null -xc - >& /dev/null || \
-  { echo skipped; exit; }
-
-cat <<EOF > $t/a.c
-#include <stdio.h>
-
-void hello() {
-  printf("Hello world\n");
-}
-
-void greet() {
-  hello();
-}
-EOF
-
-$CC -o $t/b.o -c -ggnu-pubnames -gdwarf-5 -g $t/a.c
-$CC -o $t/c.o -c -ggnu-pubnames -gdwarf-5 -g $t/a.c -gz
-
-cat <<EOF | $CC -o $t/d.o -c -xc -ggnu-pubnames -g -
-void greet();
-
-int main() {
-  greet();
-}
-EOF
-
-$CC -B. -o $t/exe1 $t/b.o $t/d.o -Wl,--gdb-index
-$QEMU $t/exe1 | grep -q 'Hello world'
-readelf -WS $t/exe1 | fgrep -q .gdb_index
-DEBUGINFOD_URLS= gdb $t/exe1 -ex 'b main' -ex run -ex cont -ex quit >& /dev/null
-
-$CC -B. -o $t/exe2 $t/c.o $t/d.o -Wl,--gdb-index
-$QEMU $t/exe2 | grep -q 'Hello world'
-readelf -WS $t/exe2 | fgrep -q .gdb_index
-DEBUGINFOD_URLS= gdb $t/exe2 -ex 'b main' -ex run -ex cont -ex quit >& /dev/null
-
-echo OK