Improved debug_lines parsing based on the stmt_list attribute of a compilation unit (#71)

fosterbrereton · web-flow · commit 45c6bf1083bb · 2024-01-11T22:37:53.000Z
diff --git a/src/dwarf.cpp b/src/dwarf.cpp
@@ -317,6 +317,14 @@ void line_header::read(freader& s, bool needs_byteswap) {
         _include_directories.push_back(cur_directory);
     }
 
+    // REVISIT (fosterbrereton): The reading here isn't entirely accurate. The current code stops the
+    // first time an empty name is found, and interprets that as the end of the file names (and thus
+    // the `line_header`). However, the spec (at the end of section 6.2.4) states "A compiler may
+    // generate a single null byte for the file names field and define file names using the
+    // extended opcode DW_LNE_define_file." This loop, then, should iterate through the end of the
+    // defined size of `_header_length` instead of using an empty name as a sentinel. Any additional
+    // null bytes should be interpreted as a placeholder file name description. (Admittedly, I
+    // haven't seen one of these in the wild yet.)
     while (true) {
         file_name cur_file_name;
         cur_file_name._name = s.read_c_string_view();
@@ -420,7 +428,7 @@ struct dwarf::implementation {
     std::int32_t read_sleb();
 
     void read_abbreviations();
-    void read_lines();
+    void read_lines(std::size_t header_offset);
     const abbrev& find_abbreviation(std::uint32_t code) const;
 
     pool_string read_debug_str(std::size_t offset);
@@ -519,10 +527,10 @@ void dwarf::implementation::read_abbreviations() {
 
 /**************************************************************************************************/
 
-void dwarf::implementation::read_lines() {
+void dwarf::implementation::read_lines(std::size_t header_offset) {
     ZoneScoped;
 
-    temp_seek(_s, _debug_line._offset, [&] {
+    temp_seek(_s, _debug_line._offset + header_offset, [&] {
         line_header header;
         header.read(_s, _details._needs_byteswap);
 
@@ -538,7 +546,8 @@ void dwarf::implementation::read_lines() {
             }
         }
 
-        // We don't need to process the rest of __debug__line. We're only here for the file table.
+        // We don't need to process the rest of this __debug_line subsection.
+        // We're only here for the file table.
     });
 }
 
@@ -1296,8 +1305,6 @@ bool dwarf::implementation::register_sections_done() {
 
     read_abbreviations();
 
-    read_lines();
-
     _ready = true;
 
     return true;
@@ -1433,6 +1440,21 @@ void dwarf::implementation::process_all_dies() {
 
                 continue;
             } else if (die._tag == dw::tag::compile_unit || die._tag == dw::tag::partial_unit) {
+                // Spec (section 3.1.1) says that compilation and partial units may specify which
+                // __debug_line subsection they want to draw their decl_files list from. This also
+                // means we need to clear our current decl_files list (from index 1 to the end)
+                // whenever we do hit either of these two dies. (What's the right action to take
+                // when a unit doesn't have a stmt_list attribute? Where do we get our file names
+                // from? Or is the expectation that the DWARF information won't specify any in that
+                // case?)
+
+                assert(!_decl_files.empty());
+                _decl_files.erase(std::next(_decl_files.begin()), _decl_files.end());
+
+                if (attributes.has_uint(dw::at::stmt_list)) {
+                    read_lines(attributes.uint(dw::at::stmt_list));
+                }
+
                 // REVISIT (fosterbrereton): If the name is a relative path, there may be a
                 // DW_AT_comp_dir attribute that specifies the path it is relative from.
                 // Is it worth making this path absolute?
diff --git a/src/main.cpp b/src/main.cpp
@@ -379,8 +379,8 @@ auto epilogue(bool exception) {
     if (log_level_at_least(settings::log_level::warning)) {
         cout_safe([&](auto& s) {
             s << "ORC complete.\n"
-              << "  " << g._odrv_count << " ODRVs reported\n"
-              << "  " << g._object_file_count << " compilation units processed\n"
+              << "  " << g._odrv_count << " ODRV(s) reported\n"
+              << "  " << g._object_file_count << " object file(s) processed\n"
               << "  " << g._die_processed_count << " dies processed\n"
               << "  " << g._die_skipped_count << " dies skipped (" << format_pct(g._die_skipped_count, g._die_processed_count) << ")\n"
               << "  " << g._unique_symbol_count << " unique symbols\n"