From 3fcb0eeb152beb4320c7632bcfa2b1e7c2e5ca00 Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Tue, 1 Dec 2020 14:45:01 -0800 Subject: [lld-macho] Emit STABS symbols for debugging, and drop debug sections Debug sections contain a large amount of data. In order not to bloat the size of the final binary, we remove them and instead emit STABS symbols for `dsymutil` and the debugger to locate their contents in the object files. With this diff, `dsymutil` is able to locate the debug info. However, we need a few more features before `lldb` is able to work well with our binaries -- e.g. having `LC_DYSYMTAB` accurately reflect the number of local symbols, emitting `LC_UUID`, and more. Those will be handled in follow-up diffs. Note also that the STABS we emit differ slightly from what ld64 does. First, we emit the path to the source file as one `N_SO` symbol instead of two. (`ld64` emits one `N_SO` for the dirname and one for the basename.) Second, we do not emit `N_BNSYM` and `N_ENSYM` STABS to mark the start and end of functions, because the `N_FUN` STABS already serve that purpose. @clayborg recommended these changes based on his knowledge of what the debugging tools look for. Additionally, this current implementation doesn't accurately reflect the size of function symbols. It uses the size of their containing sections as a proxy, but that is only accurate if `.subsections_via_symbols` is set, and if there isn't an `N_ALT_ENTRY` in that particular subsection. I think we have two options to solve this: 1. We can split up subsections by symbol even if `.subsections_via_symbols` is not set, but include constraints to ensure those subsections retain their order in the final output. This is `ld64`'s approach. 2. We could just add a `size` field to our `Symbol` class. This seems simpler, and I'm more inclined toward it, but I'm not sure if there are use cases that it doesn't handle well. As such I'm punting on the decision for now. 
Reviewed By: clayborg Differential Revision: https://reviews.llvm.org/D89257 --- lld/MachO/CMakeLists.txt | 1 + lld/MachO/Dwarf.cpp | 49 +++++++++++++++++ lld/MachO/Dwarf.h | 53 +++++++++++++++++++ lld/MachO/InputFiles.cpp | 24 +++++++++ lld/MachO/InputFiles.h | 6 +++ lld/MachO/InputSection.h | 9 +++- lld/MachO/OutputSegment.h | 1 + lld/MachO/SyntheticSections.cpp | 104 ++++++++++++++++++++++++++++++++++-- lld/MachO/SyntheticSections.h | 23 +++++++- lld/MachO/Writer.cpp | 7 ++- lld/test/MachO/stabs.s | 114 ++++++++++++++++++++++++++++++++++++++++ 11 files changed, 383 insertions(+), 8 deletions(-) create mode 100644 lld/MachO/Dwarf.cpp create mode 100644 lld/MachO/Dwarf.h create mode 100644 lld/test/MachO/stabs.s diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt index 6ddc88fb8618..6a8b5d336583 100644 --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -9,6 +9,7 @@ add_lld_library(lldMachO2 UnwindInfoSection.cpp Driver.cpp DriverUtils.cpp + Dwarf.cpp ExportTrie.cpp InputFiles.cpp InputSection.cpp diff --git a/lld/MachO/Dwarf.cpp b/lld/MachO/Dwarf.cpp new file mode 100644 index 000000000000..121f54fb1f79 --- /dev/null +++ b/lld/MachO/Dwarf.cpp @@ -0,0 +1,49 @@ +//===- DWARF.cpp ----------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Dwarf.h" +#include "InputFiles.h" +#include "InputSection.h" +#include "OutputSegment.h" + +#include + +using namespace lld; +using namespace lld::macho; +using namespace llvm; + +std::unique_ptr DwarfObject::create(ObjFile *obj) { + auto dObj = std::make_unique(); + bool hasDwarfInfo = false; + for (SubsectionMap subsecMap : obj->subsections) { + for (auto it : subsecMap) { + InputSection *isec = it.second; + if (!(isDebugSection(isec->flags) && + isec->segname == segment_names::dwarf)) + continue; + + if (isec->name == "__debug_info") { + dObj->infoSection.Data = toStringRef(isec->data); + hasDwarfInfo = true; + continue; + } + + if (StringRef *s = StringSwitch(isec->name) + .Case("__debug_abbrev", &dObj->abbrevSection) + .Case("__debug_str", &dObj->strSection) + .Default(nullptr)) { + *s = toStringRef(isec->data); + hasDwarfInfo = true; + } + } + } + + if (hasDwarfInfo) + return dObj; + return nullptr; +} diff --git a/lld/MachO/Dwarf.h b/lld/MachO/Dwarf.h new file mode 100644 index 000000000000..119f2778fc6b --- /dev/null +++ b/lld/MachO/Dwarf.h @@ -0,0 +1,53 @@ +//===- DWARF.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------===// + +#ifndef LLD_MACHO_DWARF_H +#define LLD_MACHO_DWARF_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DWARF/DWARFObject.h" + +namespace lld { +namespace macho { + +class ObjFile; + +// Implements the interface between LLVM's DWARF-parsing utilities and LLD's +// InputSection structures. 
+class DwarfObject final : public llvm::DWARFObject { +public: + bool isLittleEndian() const override { return true; } + + llvm::Optional find(const llvm::DWARFSection &sec, + uint64_t pos) const override { + // TODO: implement this + return llvm::None; + } + + void forEachInfoSections( + llvm::function_ref f) const override { + f(infoSection); + } + + llvm::StringRef getAbbrevSection() const override { return abbrevSection; } + llvm::StringRef getStrSection() const override { return strSection; } + + // Returns an instance of DwarfObject if the given object file has the + // relevant DWARF debug sections. + static std::unique_ptr create(ObjFile *); + +private: + llvm::DWARFSection infoSection; + llvm::StringRef abbrevSection; + llvm::StringRef strSection; +}; + +} // namespace macho +} // namespace lld + +#endif diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 2f65951a49c7..921b69995d43 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -44,6 +44,7 @@ #include "InputFiles.h" #include "Config.h" #include "Driver.h" +#include "Dwarf.h" #include "ExportTrie.h" #include "InputSection.h" #include "MachOStructs.h" @@ -54,6 +55,7 @@ #include "Symbols.h" #include "Target.h" +#include "lld/Common/DWARF.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" #include "lld/Common/Reproduce.h" @@ -387,6 +389,28 @@ ObjFile::ObjFile(MemoryBufferRef mb) : InputFile(ObjKind, mb) { // parsed all the symbols. 
for (size_t i = 0, n = subsections.size(); i < n; ++i) parseRelocations(sectionHeaders[i], subsections[i]); + + parseDebugInfo(); +} + +void ObjFile::parseDebugInfo() { + std::unique_ptr dObj = DwarfObject::create(this); + if (!dObj) + return; + + auto *ctx = make( + std::move(dObj), "", + [&](Error err) { warn(getName() + ": " + toString(std::move(err))); }, + [&](Error warning) { + warn(getName() + ": " + toString(std::move(warning))); + }); + + // TODO: Since object files can contain a lot of DWARF info, we should verify + // that we are parsing just the info we need + const DWARFContext::compile_unit_range &units = ctx->compile_units(); + auto it = units.begin(); + compileUnit = it->get(); + assert(std::next(it) == units.end()); } // The path can point to either a dylib or a .tbd file. diff --git a/lld/MachO/InputFiles.h b/lld/MachO/InputFiles.h index a1405aa66ea2..4356350e7c17 100644 --- a/lld/MachO/InputFiles.h +++ b/lld/MachO/InputFiles.h @@ -15,6 +15,7 @@ #include "lld/Common/Memory.h" #include "llvm/ADT/DenseSet.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/Object/Archive.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/TextAPI/MachO/InterfaceFile.h" @@ -91,6 +92,11 @@ class ObjFile : public InputFile { public: explicit ObjFile(MemoryBufferRef mb); static bool classof(const InputFile *f) { return f->kind() == ObjKind; } + + llvm::DWARFUnit *compileUnit = nullptr; + +private: + void parseDebugInfo(); }; // command-line -sectcreate file diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h index 1449c87dceb5..4ef8d84bc8a0 100644 --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -35,15 +35,20 @@ struct Reloc { llvm::PointerUnion referent; }; -inline bool isZeroFill(uint8_t flags) { +inline bool isZeroFill(uint32_t flags) { return llvm::MachO::isVirtualSection(flags & llvm::MachO::SECTION_TYPE); } -inline bool isThreadLocalVariables(uint8_t flags) { +inline bool 
isThreadLocalVariables(uint32_t flags) { return (flags & llvm::MachO::SECTION_TYPE) == llvm::MachO::S_THREAD_LOCAL_VARIABLES; } +inline bool isDebugSection(uint32_t flags) { + return (flags & llvm::MachO::SECTION_ATTRIBUTES_USR) == + llvm::MachO::S_ATTR_DEBUG; +} + class InputSection { public: virtual ~InputSection() = default; diff --git a/lld/MachO/OutputSegment.h b/lld/MachO/OutputSegment.h index 62342370bf81..63b62d5e9109 100644 --- a/lld/MachO/OutputSegment.h +++ b/lld/MachO/OutputSegment.h @@ -23,6 +23,7 @@ constexpr const char data[] = "__DATA"; constexpr const char linkEdit[] = "__LINKEDIT"; constexpr const char dataConst[] = "__DATA_CONST"; constexpr const char ld[] = "__LD"; // output only with -r +constexpr const char dwarf[] = "__DWARF"; } // namespace segment_names diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 5d603d6f90d3..feaa4b5d3e22 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -20,7 +20,9 @@ #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/Path.h" using namespace llvm; using namespace llvm::support; @@ -574,17 +576,100 @@ SymtabSection::SymtabSection(StringTableSection &stringTableSection) stringTableSection(stringTableSection) {} uint64_t SymtabSection::getRawSize() const { - return symbols.size() * sizeof(structs::nlist_64); + return getNumSymbols() * sizeof(structs::nlist_64); +} + +void SymtabSection::emitBeginSourceStab(DWARFUnit *compileUnit) { + StabsEntry stab(MachO::N_SO); + SmallString<261> dir(compileUnit->getCompilationDir()); + StringRef sep = sys::path::get_separator(); + // We don't use `path::append` here because we want an empty `dir` to result + // in an absolute path. `append` would give us a relative path for that case. 
+ if (!dir.endswith(sep)) + dir += sep; + stab.strx = stringTableSection.addString( + saver.save(dir + compileUnit->getUnitDIE().getShortName())); + stabs.emplace_back(std::move(stab)); +} + +void SymtabSection::emitEndSourceStab() { + StabsEntry stab(MachO::N_SO); + stab.sect = 1; + stabs.emplace_back(std::move(stab)); +} + +void SymtabSection::emitObjectFileStab(ObjFile *file) { + StabsEntry stab(MachO::N_OSO); + stab.sect = target->cpuSubtype; + SmallString<261> path(file->getName()); + std::error_code ec = sys::fs::make_absolute(path); + if (ec) + fatal("failed to get absolute path for " + file->getName()); + + stab.strx = stringTableSection.addString(saver.save(path.str())); + stab.desc = 1; + stabs.emplace_back(std::move(stab)); +} + +void SymtabSection::emitFunStabs(Defined *defined) { + { + StabsEntry stab(MachO::N_FUN); + stab.sect = 1; + stab.strx = stringTableSection.addString(defined->getName()); + stab.value = defined->getVA(); + stabs.emplace_back(std::move(stab)); + } + + { + StabsEntry stab(MachO::N_FUN); + // FIXME this should be the size of the symbol. Using the section size in + // lieu is only correct if .subsections_via_symbols is set. + stab.value = defined->isec->getSize(); + stabs.emplace_back(std::move(stab)); + } } void SymtabSection::finalizeContents() { - // TODO support other symbol types + InputFile *lastFile = nullptr; for (Symbol *sym : symtab->getSymbols()) { + // TODO support other symbol types if (isa(sym) || sym->isInGot() || sym->isInStubs()) { sym->symtabIndex = symbols.size(); symbols.push_back({sym, stringTableSection.addString(sym->getName())}); } + + // Emit STABS symbols so that dsymutil and/or the debugger can map address + // regions in the final binary to the source and object files from which + // they originated. + if (auto *defined = dyn_cast(sym)) { + if (defined->isAbsolute()) + continue; + + InputSection *isec = defined->isec; + // XXX is it right to assume that all symbols in __text are function + // symbols? 
+ if (isec->name == "__text") { + ObjFile *file = dyn_cast(isec->file); + assert(file); + if (!file->compileUnit) + continue; + + if (lastFile == nullptr || lastFile != file) { + if (lastFile != nullptr) + emitEndSourceStab(); + lastFile = file; + + emitBeginSourceStab(file->compileUnit); + emitObjectFileStab(file); + } + emitFunStabs(defined); + } + // TODO emit stabs for non-function symbols too + } } + + if (!stabs.empty()) + emitEndSourceStab(); } void SymtabSection::writeTo(uint8_t *buf) const { @@ -602,12 +687,23 @@ void SymtabSection::writeTo(uint8_t *buf) const { nList->n_type = MachO::N_EXT | MachO::N_SECT; nList->n_sect = defined->isec->parent->index; // For the N_SECT symbol type, n_value is the address of the symbol - nList->n_value = defined->value + defined->isec->getVA(); + nList->n_value = defined->getVA(); } nList->n_desc |= defined->isWeakDef() ? MachO::N_WEAK_DEF : 0; } ++nList; } + + // Emit the stabs entries after the "real" symbols. We cannot emit them + // before as that would render Symbol::symtabIndex inaccurate. 
+ for (const StabsEntry &entry : stabs) { + nList->n_strx = entry.strx; + nList->n_type = entry.type; + nList->n_sect = entry.sect; + nList->n_desc = entry.desc; + nList->n_value = entry.value; + ++nList; + } } IndirectSymtabSection::IndirectSymtabSection() @@ -656,7 +752,7 @@ StringTableSection::StringTableSection() uint32_t StringTableSection::addString(StringRef str) { uint32_t strx = size; - strings.push_back(str); + strings.push_back(str); // TODO: consider deduplicating strings size += str.size() + 1; // account for null terminator return strx; } diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index 1736b6a900ca..4a8820a285a8 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -20,6 +20,10 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Support/raw_ostream.h" +namespace llvm { +class DWARFUnit; +} // namespace llvm + namespace lld { namespace macho { @@ -48,6 +52,7 @@ constexpr const char ehFrame[] = "__eh_frame"; class Defined; class DylibSymbol; class LoadCommand; +class ObjFile; class SyntheticSection : public OutputSection { public: @@ -405,16 +410,32 @@ struct SymtabEntry { size_t strx; }; +struct StabsEntry { + uint8_t type; + uint32_t strx = 0; + uint8_t sect = 0; + uint16_t desc = 0; + uint64_t value = 0; + + explicit StabsEntry(uint8_t type) : type(type) {} +}; + class SymtabSection : public LinkEditSection { public: SymtabSection(StringTableSection &); void finalizeContents(); - size_t getNumSymbols() const { return symbols.size(); } + size_t getNumSymbols() const { return stabs.size() + symbols.size(); } uint64_t getRawSize() const override; void writeTo(uint8_t *buf) const override; private: + void emitBeginSourceStab(llvm::DWARFUnit *compileUnit); + void emitEndSourceStab(); + void emitObjectFileStab(ObjFile *); + void emitFunStabs(Defined *); + StringTableSection &stringTableSection; + std::vector stabs; std::vector symbols; }; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp 
index fa42c1c7e61c..e9d88af90ecc 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -578,6 +578,10 @@ void Writer::createOutputSections() { MapVector, MergedOutputSection *> mergedOutputSections; for (InputSection *isec : inputSections) { + // Instead of emitting DWARF sections, we emit STABS symbols to the object + // files that contain them. + if (isDebugSection(isec->flags) && isec->segname == segment_names::dwarf) + continue; MergedOutputSection *&osec = mergedOutputSections[{isec->segname, isec->name}]; if (osec == nullptr) @@ -591,8 +595,9 @@ void Writer::createOutputSections() { if (unwindInfoSection && segname == segment_names::ld) { assert(osec->name == section_names::compactUnwind); unwindInfoSection->setCompactUnwindSection(osec); - } else + } else { getOrCreateOutputSegment(segname)->addOutputSection(osec); + } } for (SyntheticSection *ssec : syntheticSections) { diff --git a/lld/test/MachO/stabs.s b/lld/test/MachO/stabs.s new file mode 100644 index 000000000000..5e85ccc3bc4a --- /dev/null +++ b/lld/test/MachO/stabs.s @@ -0,0 +1,114 @@ +# REQUIRES: x86 +# UNSUPPORTED: system-windows +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo.s -o %t/foo.o + +# RUN: %lld -lSystem %t/test.o %t/foo.o -o %t/test +# RUN: llvm-nm -pa %t/test | FileCheck %s -DDIR=%t + +## Check that we emit absolute paths to the object files in our OSO entries +## even if our inputs are relative paths. 
+# RUN: cd %t && %lld -lSystem test.o foo.o -o test +# RUN: llvm-nm -pa %t/test | FileCheck %s -DDIR=%t + +# CHECK-DAG: [[#%x, MAIN:]] T _main +# CHECK-DAG: [[#%x, FOO: ]] T _foo +# CHECK: 0000000000000000 - 00 0000 SO /tmp/test.cpp +# CHECK-NEXT: 0000000000000000 - 03 0001 OSO [[DIR]]/test.o +# CHECK-NEXT: [[#MAIN]] - 01 0000 FUN _main +# CHECK-NEXT: 0000000000000001 - 00 0000 FUN +# CHECK-NEXT: 0000000000000000 - 01 0000 SO +# CHECK-NEXT: 0000000000000000 - 00 0000 SO /foo.cpp +# CHECK-NEXT: 0000000000000000 - 03 0001 OSO [[DIR]]/foo.o +# CHECK-NEXT: [[#FOO]] - 01 0000 FUN _foo +# CHECK-NEXT: 0000000000000001 - 00 0000 FUN +# CHECK-NEXT: 0000000000000000 - 01 0000 SO + +#--- test.s +.text +.globl _main +_main: +Lfunc_begin0: + retq +Lfunc_end0: + +.section __DWARF,__debug_str,regular,debug + .asciz "test.cpp" ## string offset=0 + .asciz "/tmp" ## string offset=9 +.section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 0 ## EOM(1) +.section __DWARF,__debug_info,regular,debug +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 4 ## DWARF version number +.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. 
Section + .long Lset1 + .byte 8 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xb:0x48 DW_TAG_compile_unit + .long 0 ## DW_AT_name + .long 9 ## DW_AT_comp_dir + .quad Lfunc_begin0 ## DW_AT_low_pc +.set Lset3, Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset3 + .byte 0 ## End Of Children Mark +Ldebug_info_end0: +.subsections_via_symbols +.section __DWARF,__debug_line,regular,debug + +#--- foo.s +.text +.globl _foo +_foo: +Lfunc_begin0: + retq +Lfunc_end0: + +.section __DWARF,__debug_str,regular,debug + .asciz "foo.cpp" ## string offset=0 + .asciz "" ## string offset=8 +.section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 0 ## EOM(1) +.section __DWARF,__debug_info,regular,debug +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 4 ## DWARF version number +.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section + .long Lset1 + .byte 8 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xb:0x48 DW_TAG_compile_unit + .long 0 ## DW_AT_name + .long 8 ## DW_AT_comp_dir + .quad Lfunc_begin0 ## DW_AT_low_pc +.set Lset3, Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset3 + .byte 0 ## End Of Children Mark +Ldebug_info_end0: +.subsections_via_symbols +.section __DWARF,__debug_line,regular,debug -- cgit v1.2.3-65-gdbad