From 0c3181f6fe91b4899a75691a6ab23e4e32910497 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 20 Aug 2025 10:29:01 +0000 Subject: [PATCH 1/5] Support names field in source maps This adds support for the `names` field in source maps, which contains function names. Source map mappings are correspondingly updated and emsymbolizer can now provide function name information with source maps alone. To do this, you can use `emcc -gsource-map=names`. This also adds separate internal settings for this name generation and the existing source embedding, making them more readable. Also, because they are internal settings, they don't increase the number of external options. When you run `wasm-sourcemap.py` standalone you can use `--names`. While we have the name sections and DWARF, I think it is generally good to support, given that the field exists for that purpose and JS source maps support it. It looks like the Dart toolchain also supports it: https://github.com/dart-lang/sdk/blob/187c3cb004b5f6a0a1f1b242b7d1b8a6b33b9a7a/pkg/wasm_builder/lib/source_map.dart#L105-L118 To measure source map size increase, I ran this on `wasm-opt.wasm` built by the `if (EMSCRIPTEN)` setup here (https://github.com/WebAssembly/binaryen/blob/969bf763a495b475e2a28163e7d70a5dd01f9dda/CMakeLists.txt#L299-L365) with `-gsource-map` vs. `-gsource-map=names`. The source map file size increased from 352743 to 443373, about 25%. While I think a 25% increase of the source map file size is tolerable, this option is off by default, because with this we can't use #9580. So far we only needed `DW_TAG_compile_unit`s in `llvm-dwarfdump` results, and for that we could get away with printing only the top-level tags using `--recurse-depth=0`. But to gather function information, we need to parse all `DW_TAG_subprogram`s, which can be at any depth (because functions can be within nested namespaces or classes). So the trick in #9580 does not work and dumping the entire `.debug_info` section will be slow. 
To avoid this problem, we can consider using DWARF-parsing Python libraries like https://github.com/eliben/pyelftools, but this will make another third party dependency, so I'm not sure if it's worth it at this point. Fixes #20715. --- emsymbolizer.py | 8 +++- src/settings_internal.js | 11 +++-- test/test_other.py | 25 ++++++---- tools/building.py | 5 +- tools/cmdline.py | 10 +++- tools/wasm-sourcemap.py | 101 +++++++++++++++++++++++++++++++++++---- 6 files changed, 135 insertions(+), 25 deletions(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index 36c4c33a4cb3f..40b29b2a958d9 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -110,6 +110,7 @@ def __init__(self, source=None, line=0, column=0, func=None): def __init__(self): self.version = None self.sources = [] + self.funcs = [] self.mappings = {} self.offsets = [] @@ -121,6 +122,7 @@ def parse(self, filename): self.version = source_map_json['version'] self.sources = source_map_json['sources'] + self.funcs = source_map_json['names'] chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' vlq_map = {c: i for i, c in enumerate(chars)} @@ -148,6 +150,7 @@ def decodeVLQ(string): src = 0 line = 1 col = 1 + func = 0 for segment in source_map_json['mappings'].split(','): data = decodeVLQ(segment) info = [] @@ -162,7 +165,9 @@ def decodeVLQ(string): if len(data) >= 4: col += data[3] info.append(col) - # TODO: see if we need the name, which is the next field (data[4]) + if len(data) == 5: + func += data[4] + info.append(func) self.mappings[offset] = WasmSourceMap.Location(*info) self.offsets.append(offset) @@ -189,6 +194,7 @@ def lookup(self, offset): self.sources[info.source] if info.source is not None else None, info.line, info.column, + self.funcs[info.func] if info.func is not None else None, ) diff --git a/src/settings_internal.js b/src/settings_internal.js index 1b0502323243e..8d9cc902f0b3d 100644 --- a/src/settings_internal.js +++ b/src/settings_internal.js @@ -194,10 +194,15 @@ var 
EXPECT_MAIN = true; // If true, building against Emscripten's wasm heap memory profiler. var MEMORYPROFILER = false; -// Set automatically to : -// - 1 when using `-gsource-map` -// - 2 when using `gsource-map=inline` (embed sources content in souce map) +// Source map related options. You can specify both like +// -gsource-map=inline,names +// +// -gsource-map var GENERATE_SOURCE_MAP = 0; +// -gsource-map=inline +var EMBED_SOURCE_MAP_SOURCE = 0; +// -gsource-map=names +var GENERATE_SOURCE_MAP_NAMES = 0; var GENERATE_DWARF = false; diff --git a/test/test_other.py b/test/test_other.py index 7d77af1801e3a..7d61405767c42 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -10921,17 +10921,18 @@ def check_dwarf_loc_info(address, funcs, locs): for loc in locs: self.assertIn(loc, out) - def check_source_map_loc_info(address, loc): + def check_source_map_loc_info(address, func, loc): out = self.run_process( [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address], stdout=PIPE).stdout + self.assertIn(func, out) self.assertIn(loc, out) # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 # __builtin_trap(); // line 13 self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) + '-g', '-gsource-map=names', '-O1', '-o', 'test_dwarf.js']) # Address of out_to_js(0) within foo(), uninlined out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') # Address of __builtin_trap() within bar(), inlined into main() @@ -10951,22 +10952,28 @@ def check_source_map_loc_info(address, loc): # 1. Test DWARF + source map together # For DWARF, we check for the full inlined info for both function names and - # source locations. Source maps provide neither function names nor inlined - # info. So we only check for the source location of the outermost function. + # source locations. Source maps does not provide inlined info. So we only + # check for the info of the outermost function. 
check_dwarf_loc_info(out_to_js_call_addr, out_to_js_call_func, out_to_js_call_loc) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) + # Source map shows the original (inlined) source location with the function + # name that was inlined into + check_source_map_loc_info(unreachable_addr, unreachable_func[1], + unreachable_loc[0]) # 2. Test source map only # The addresses, function names, and source locations are the same across # the builds because they are relative offsets from the code section, so we # don't need to recompute them self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-gsource-map', '-O1', '-o', 'test_dwarf.js']) - check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_loc[0]) - check_source_map_loc_info(unreachable_addr, unreachable_loc[0]) + '-gsource-map=names', '-O1', '-o', 'test_dwarf.js']) + check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], + out_to_js_call_loc[0]) + check_source_map_loc_info(unreachable_addr, unreachable_func[1], + unreachable_loc[0]) # 3. 
Test DWARF only self.run_process([EMCC, test_file('core/test_dwarf.c'), diff --git a/tools/building.py b/tools/building.py index 05d8a0e7ebad3..748e9d8fb8a3c 100644 --- a/tools/building.py +++ b/tools/building.py @@ -1145,9 +1145,10 @@ def emit_wasm_source_map(wasm_file, map_file, final_wasm): if settings.SOURCE_MAP_PREFIXES: sourcemap_cmd += ['--prefix', *settings.SOURCE_MAP_PREFIXES] - - if settings.GENERATE_SOURCE_MAP == 2: + if settings.EMBED_SOURCE_MAP_SOURCE: sourcemap_cmd += ['--sources'] + if settings.GENERATE_SOURCE_MAP_NAMES: + sourcemap_cmd += ['--names'] check_call(sourcemap_cmd) diff --git a/tools/cmdline.py b/tools/cmdline.py index 28621175d8d36..353ed21c64a3f 100644 --- a/tools/cmdline.py +++ b/tools/cmdline.py @@ -394,10 +394,16 @@ def consume_arg_file(): else: settings.SEPARATE_DWARF = True settings.GENERATE_DWARF = 1 - elif requested_level in ['source-map', 'source-map=inline']: - settings.GENERATE_SOURCE_MAP = 1 if requested_level == 'source-map' else 2 + elif requested_level.startswith('source-map'): + settings.GENERATE_SOURCE_MAP = 1 settings.EMIT_NAME_SECTION = 1 newargs[i] = '-g' + if '=' in requested_level: + source_map_options = requested_level.split('=')[1].split(',') + if 'inline' in source_map_options: + settings.EMBED_SOURCE_MAP_SOURCE = 1 + if 'names' in source_map_options: + settings.GENERATE_SOURCE_MAP_NAMES = 1 elif requested_level == 'z': # Ignore `-gz`. We don't support debug info compression. 
continue diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index e9f39bd591110..cc5bc08361c38 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -11,6 +11,7 @@ """ import argparse +import bisect import json import logging from math import floor, log @@ -46,6 +47,7 @@ def parse_args(): parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable") parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS) parser.add_argument('--basepath', help='base path for source files, which will be relative to this') + parser.add_argument('--names', action='store_true', help='Support function names in names field') return parser.parse_args() @@ -232,7 +234,48 @@ def extract_comp_dir_map(text): return map_stmt_list_to_comp_dir -def read_dwarf_entries(wasm, options): +# This function parses DW_TAG_subprogram entries and gets low_pc and high_pc for +# each function in a list of +# [((low_pc0, high_pc0), func0), ((low_pc1, high_pc1), func1), ... ] +# The result list will be sorted in the increasing order of low_pcs. +def extract_func_ranges(text): + # The example of a DW_TAG_subprogram is + # + # 0x000000ba: DW_TAG_subprogram + # DW_AT_low_pc (0x0000005f) + # DW_AT_high_pc (0x00000071) + # DW_AT_frame_base (DW_OP_WASM_location 0x3 0x0, DW_OP_stack_value) + # DW_AT_name ("foo") + # DW_AT_decl_file ("../foo.c") + # DW_AT_decl_line (3) + # DW_AT_external (true) + # + # This parses the value of DW_AT_low_pc, DW_AT_high_pc, and DW_AT_name + # attributes. 
+ func_ranges = [] + dw_tags = re.split(r'\r?\n(?=0x[0-9a-f]+:)', text) + for tag in dw_tags: + if re.search(r"0x[0-9a-f]+:\s+DW_TAG_subprogram", tag): + name = None + low_pc = None + high_pc = None + m = re.search(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)', tag) + if m: + low_pc = int(m.group(1), 16) + m = re.search(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)', tag) + if m: + high_pc = int(m.group(1), 16) + m = re.search(r'DW_AT_name\s+\("([^"]+)"\)', tag) + if m: + name = m.group(1) + if name and low_pc and high_pc: + func_ranges.append(((low_pc, high_pc), name)) + # Sort the list based on low_pcs + func_ranges = sorted(func_ranges, key=lambda item: item[0][0]) + return func_ranges + + +def read_dwarf_info(wasm, options): if options.dwarfdump_output: output = Path(options.dwarfdump_output).read_bytes() elif options.dwarfdump: @@ -240,7 +283,13 @@ def read_dwarf_entries(wasm, options): if not os.path.exists(options.dwarfdump): logger.error('llvm-dwarfdump not found: ' + options.dwarfdump) sys.exit(1) - process = Popen([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=PIPE) + # --recurse-depth=0 only prints 'DW_TAG_compile_unit's, reducing the size of + # text output. But to support function names info, we need + # 'DW_TAG_subprogram's, which can be at any depth. 
+ dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm] + if not options.names: + dwarfdump_cmd.append('--recurse-depth=0') + process = Popen(dwarfdump_cmd, stdout=PIPE) output, err = process.communicate() exit_code = process.wait() if exit_code != 0: @@ -297,22 +346,52 @@ def read_dwarf_entries(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - return sorted(entries, key=lambda entry: entry['address']) + prev_entries = entries.copy() + entries = sorted(entries, key=lambda entry: entry['address']) + func_ranges = [] + if options.names: + func_ranges = extract_func_ranges(debug_line_chunks[0]) + return entries, func_ranges -def build_sourcemap(entries, code_section_offset, options): + +def build_sourcemap(entries, func_ranges, code_section_offset, options): base_path = options.basepath collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) + # Add code section offset to the low/high pc in the function PC ranges + if options.names: + for i in range(len(func_ranges)): + (low_pc, high_pc), name = func_ranges[i] + func_ranges[i] = ((low_pc + code_section_offset), (high_pc + code_section_offset)), name + func_low_pcs = [item[0][0] for item in func_ranges] + sources = [] sources_content = [] + names = [item[1] for item in func_ranges] mappings = [] sources_map = {} last_address = 0 last_source_id = 0 last_line = 1 last_column = 1 + last_func_id = 0 + + # Get the function ID that the given address falls into + def get_function_id(func_ranges, addr): + if not options.names: + return None + index = bisect.bisect_right(func_low_pcs, address) + if index == 0: # The address is lower than the first function's start + return None + candidate_index = index - 1 + (low_pc, high_pc), name = func_ranges[candidate_index] + # Check the address within the candidate's [low_pc, high_pc) range. If not, + # it is in a gap between functions. 
+ if low_pc <= address < high_pc: + return candidate_index + return None for entry in entries: line = entry['line'] @@ -343,21 +422,27 @@ def build_sourcemap(entries, code_section_offset, options): sources_content.append(None) else: source_id = sources_map[source_name] + func_id = get_function_id(func_ranges, address) address_delta = address - last_address source_id_delta = source_id - last_source_id line_delta = line - last_line column_delta = column - last_column - mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta)) last_address = address last_source_id = source_id last_line = line last_column = column + mapping = encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta) + if func_id is not None: + func_id_delta = func_id - last_func_id + last_func_id = func_id + mapping += encode_vlq(func_id_delta) + mappings.append(mapping) return {'version': 3, 'sources': sources, 'sourcesContent': sources_content, - 'names': [], + 'names': names, 'mappings': ','.join(mappings)} @@ -368,12 +453,12 @@ def main(): with open(wasm_input, 'rb') as infile: wasm = infile.read() - entries = read_dwarf_entries(wasm_input, options) + entries, func_ranges = read_dwarf_info(wasm_input, options) code_section_offset = get_code_section_offset(wasm) logger.debug('Saving to %s' % options.output) - map = build_sourcemap(entries, code_section_offset, options) + map = build_sourcemap(entries, func_ranges, code_section_offset, options) with open(options.output, 'w', encoding='utf-8') as outfile: json.dump(map, outfile, separators=(',', ':'), ensure_ascii=False) From a0421214a0c92e9d011122f4128567cf6322312c Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 25 Aug 2025 17:51:47 +0000 Subject: [PATCH 2/5] Fix ruff --- tools/wasm-sourcemap.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 
cc5bc08361c38..f32b10db5032a 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -346,7 +346,6 @@ def read_dwarf_info(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - prev_entries = entries.copy() entries = sorted(entries, key=lambda entry: entry['address']) func_ranges = [] @@ -379,7 +378,7 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): last_func_id = 0 # Get the function ID that the given address falls into - def get_function_id(func_ranges, addr): + def get_function_id(address): if not options.names: return None index = bisect.bisect_right(func_low_pcs, address) @@ -422,7 +421,7 @@ def get_function_id(func_ranges, addr): sources_content.append(None) else: source_id = sources_map[source_name] - func_id = get_function_id(func_ranges, address) + func_id = get_function_id(address) address_delta = address - last_address source_id_delta = source_id - last_source_id From a5ae4fa7f500a8c28ed7709653447ec6a23596e4 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 25 Aug 2025 21:55:58 +0000 Subject: [PATCH 3/5] Remove -gsourcemap=names option --- src/settings_internal.js | 11 +++-------- test/test_other.py | 4 ++-- tools/building.py | 5 ++--- tools/cmdline.py | 10 ++-------- tools/wasm-sourcemap.py | 13 ++++--------- 5 files changed, 13 insertions(+), 30 deletions(-) diff --git a/src/settings_internal.js b/src/settings_internal.js index 8d9cc902f0b3d..1b0502323243e 100644 --- a/src/settings_internal.js +++ b/src/settings_internal.js @@ -194,15 +194,10 @@ var EXPECT_MAIN = true; // If true, building against Emscripten's wasm heap memory profiler. var MEMORYPROFILER = false; -// Source map related options. 
You can specify both like -// -gsource-map=inline,names -// -// -gsource-map +// Set automatically to : +// - 1 when using `-gsource-map` +// - 2 when using `gsource-map=inline` (embed sources content in souce map) var GENERATE_SOURCE_MAP = 0; -// -gsource-map=inline -var EMBED_SOURCE_MAP_SOURCE = 0; -// -gsource-map=names -var GENERATE_SOURCE_MAP_NAMES = 0; var GENERATE_DWARF = false; diff --git a/test/test_other.py b/test/test_other.py index ebe385e2ecd20..b8c78db49bf0e 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -10928,7 +10928,7 @@ def check_source_map_loc_info(address, func, loc): # out_to_js(0); // line 6 # __builtin_trap(); // line 13 self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-g', '-gsource-map=names', '-O1', '-o', 'test_dwarf.js']) + '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) # Address of out_to_js(0) within foo(), uninlined out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') # Address of __builtin_trap() within bar(), inlined into main() @@ -10965,7 +10965,7 @@ def check_source_map_loc_info(address, func, loc): # the builds because they are relative offsets from the code section, so we # don't need to recompute them self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-gsource-map=names', '-O1', '-o', 'test_dwarf.js']) + '-gsource-map', '-O1', '-o', 'test_dwarf.js']) check_source_map_loc_info(out_to_js_call_addr, out_to_js_call_func[0], out_to_js_call_loc[0]) check_source_map_loc_info(unreachable_addr, unreachable_func[1], diff --git a/tools/building.py b/tools/building.py index 748e9d8fb8a3c..05d8a0e7ebad3 100644 --- a/tools/building.py +++ b/tools/building.py @@ -1145,10 +1145,9 @@ def emit_wasm_source_map(wasm_file, map_file, final_wasm): if settings.SOURCE_MAP_PREFIXES: sourcemap_cmd += ['--prefix', *settings.SOURCE_MAP_PREFIXES] - if settings.EMBED_SOURCE_MAP_SOURCE: + + if settings.GENERATE_SOURCE_MAP == 2: sourcemap_cmd += ['--sources'] - if settings.GENERATE_SOURCE_MAP_NAMES: - 
sourcemap_cmd += ['--names'] check_call(sourcemap_cmd) diff --git a/tools/cmdline.py b/tools/cmdline.py index 353ed21c64a3f..28621175d8d36 100644 --- a/tools/cmdline.py +++ b/tools/cmdline.py @@ -394,16 +394,10 @@ def consume_arg_file(): else: settings.SEPARATE_DWARF = True settings.GENERATE_DWARF = 1 - elif requested_level.startswith('source-map'): - settings.GENERATE_SOURCE_MAP = 1 + elif requested_level in ['source-map', 'source-map=inline']: + settings.GENERATE_SOURCE_MAP = 1 if requested_level == 'source-map' else 2 settings.EMIT_NAME_SECTION = 1 newargs[i] = '-g' - if '=' in requested_level: - source_map_options = requested_level.split('=')[1].split(',') - if 'inline' in source_map_options: - settings.EMBED_SOURCE_MAP_SOURCE = 1 - if 'names' in source_map_options: - settings.GENERATE_SOURCE_MAP_NAMES = 1 elif requested_level == 'z': # Ignore `-gz`. We don't support debug info compression. continue diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index f32b10db5032a..57768acaba360 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -47,7 +47,6 @@ def parse_args(): parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable") parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS) parser.add_argument('--basepath', help='base path for source files, which will be relative to this') - parser.add_argument('--names', action='store_true', help='Support function names in names field') return parser.parse_args() @@ -359,12 +358,10 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) - # Add code section offset to the low/high pc in the function PC ranges - if options.names: - for i in range(len(func_ranges)): - (low_pc, high_pc), name = func_ranges[i] - func_ranges[i] = ((low_pc + code_section_offset), (high_pc + code_section_offset)), name - func_low_pcs = [item[0][0] 
for item in func_ranges] + for i in range(len(func_ranges)): + (low_pc, high_pc), name = func_ranges[i] + func_ranges[i] = ((low_pc + code_section_offset), (high_pc + code_section_offset)), name + func_low_pcs = [item[0][0] for item in func_ranges] sources = [] sources_content = [] @@ -379,8 +376,6 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): # Get the function ID that the given address falls into def get_function_id(address): - if not options.names: - return None index = bisect.bisect_right(func_low_pcs, address) if index == 0: # The address is lower than the first function's start return None From 4c1986df3bb5f2700fa56f2fa463e881790cd1ee Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 25 Aug 2025 21:58:00 +0000 Subject: [PATCH 4/5] Revert changes to read_dwarf_entries --- tools/wasm-sourcemap.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 57768acaba360..044aa3cc86c47 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -274,7 +274,7 @@ def extract_func_ranges(text): return func_ranges -def read_dwarf_info(wasm, options): +def read_dwarf_entries(wasm, options): if options.dwarfdump_output: output = Path(options.dwarfdump_output).read_bytes() elif options.dwarfdump: @@ -282,13 +282,7 @@ def read_dwarf_info(wasm, options): if not os.path.exists(options.dwarfdump): logger.error('llvm-dwarfdump not found: ' + options.dwarfdump) sys.exit(1) - # --recurse-depth=0 only prints 'DW_TAG_compile_unit's, reducing the size of - # text output. But to support function names info, we need - # 'DW_TAG_subprogram's, which can be at any depth. 
- dwarfdump_cmd = [options.dwarfdump, '-debug-info', '-debug-line', wasm] - if not options.names: - dwarfdump_cmd.append('--recurse-depth=0') - process = Popen(dwarfdump_cmd, stdout=PIPE) + process = Popen([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=PIPE) output, err = process.communicate() exit_code = process.wait() if exit_code != 0: @@ -345,12 +339,7 @@ def read_dwarf_info(wasm, options): remove_dead_entries(entries) # return entries sorted by the address field - entries = sorted(entries, key=lambda entry: entry['address']) - - func_ranges = [] - if options.names: - func_ranges = extract_func_ranges(debug_line_chunks[0]) - return entries, func_ranges + return sorted(entries, key=lambda entry: entry['address']) def build_sourcemap(entries, func_ranges, code_section_offset, options): @@ -447,7 +436,7 @@ def main(): with open(wasm_input, 'rb') as infile: wasm = infile.read() - entries, func_ranges = read_dwarf_info(wasm_input, options) + entries = read_dwarf_entries(wasm_input, options) code_section_offset = get_code_section_offset(wasm) From cf59bd6500b5ceb210b50c266aef0f3a486241aa Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 26 Aug 2025 06:50:13 +0000 Subject: [PATCH 5/5] Use name section for func info --- tools/wasm-sourcemap.py | 73 +++++++++++++++-------------------------- tools/webassembly.py | 28 ++++++++++++++++ 2 files changed, 54 insertions(+), 47 deletions(-) diff --git a/tools/wasm-sourcemap.py b/tools/wasm-sourcemap.py index 044aa3cc86c47..df03d3e2488ef 100755 --- a/tools/wasm-sourcemap.py +++ b/tools/wasm-sourcemap.py @@ -28,6 +28,7 @@ from tools import utils from tools.system_libs import DETERMINISTIC_PREFIX from tools.shared import path_from_root +from tools import webassembly EMSCRIPTEN_PREFIX = utils.normalize_path(path_from_root()) @@ -233,47 +234,6 @@ def extract_comp_dir_map(text): return map_stmt_list_to_comp_dir -# This function parses DW_TAG_subprogram entries and gets low_pc and high_pc for 
-# each function in a list of -# [((low_pc0, high_pc0), func0), ((low_pc1, high_pc1), func1), ... ] -# The result list will be sorted in the increasing order of low_pcs. -def extract_func_ranges(text): - # The example of a DW_TAG_subprogram is - # - # 0x000000ba: DW_TAG_subprogram - # DW_AT_low_pc (0x0000005f) - # DW_AT_high_pc (0x00000071) - # DW_AT_frame_base (DW_OP_WASM_location 0x3 0x0, DW_OP_stack_value) - # DW_AT_name ("foo") - # DW_AT_decl_file ("../foo.c") - # DW_AT_decl_line (3) - # DW_AT_external (true) - # - # This parses the value of DW_AT_low_pc, DW_AT_high_pc, and DW_AT_name - # attributes. - func_ranges = [] - dw_tags = re.split(r'\r?\n(?=0x[0-9a-f]+:)', text) - for tag in dw_tags: - if re.search(r"0x[0-9a-f]+:\s+DW_TAG_subprogram", tag): - name = None - low_pc = None - high_pc = None - m = re.search(r'DW_AT_low_pc\s+\(0x([0-9a-f]+)\)', tag) - if m: - low_pc = int(m.group(1), 16) - m = re.search(r'DW_AT_high_pc\s+\(0x([0-9a-f]+)\)', tag) - if m: - high_pc = int(m.group(1), 16) - m = re.search(r'DW_AT_name\s+\("([^"]+)"\)', tag) - if m: - name = m.group(1) - if name and low_pc and high_pc: - func_ranges.append(((low_pc, high_pc), name)) - # Sort the list based on low_pcs - func_ranges = sorted(func_ranges, key=lambda item: item[0][0]) - return func_ranges - - def read_dwarf_entries(wasm, options): if options.dwarfdump_output: output = Path(options.dwarfdump_output).read_bytes() @@ -342,19 +302,35 @@ def read_dwarf_entries(wasm, options): return sorted(entries, key=lambda entry: entry['address']) +def read_func_ranges(wasm_input): + with webassembly.Module(wasm_input) as module: + if not module.has_name_section(): + return [] + funcs = module.get_functions() + func_names = module.get_function_names()[module.num_imported_funcs():] + assert len(funcs) == len(func_names) + + # Replace '__original_main' with 'main' + try: + original_main_index = func_names.index('__original_main') + func_names[original_main_index] = 'main' + except ValueError: + pass + + 
func_ranges = [(n, (f.offset, f.offset + f.size)) for n, f in zip(func_names, funcs)] + return func_ranges + + def build_sourcemap(entries, func_ranges, code_section_offset, options): base_path = options.basepath collect_sources = options.sources prefixes = SourceMapPrefixes(options.prefix, options.load_prefix, base_path) - for i in range(len(func_ranges)): - (low_pc, high_pc), name = func_ranges[i] - func_ranges[i] = ((low_pc + code_section_offset), (high_pc + code_section_offset)), name - func_low_pcs = [item[0][0] for item in func_ranges] + func_low_pcs = [item[1][0] for item in func_ranges] sources = [] sources_content = [] - names = [item[1] for item in func_ranges] + names = [item[0] for item in func_ranges] mappings = [] sources_map = {} last_address = 0 @@ -365,11 +341,13 @@ def build_sourcemap(entries, func_ranges, code_section_offset, options): # Get the function ID that the given address falls into def get_function_id(address): + if not func_ranges: + return None index = bisect.bisect_right(func_low_pcs, address) if index == 0: # The address is lower than the first function's start return None candidate_index = index - 1 - (low_pc, high_pc), name = func_ranges[candidate_index] + name, (low_pc, high_pc) = func_ranges[candidate_index] # Check the address within the candidate's [low_pc, high_pc) range. If not, # it is in a gap between functions. 
if low_pc <= address < high_pc: @@ -437,6 +415,7 @@ def main(): wasm = infile.read() entries = read_dwarf_entries(wasm_input, options) + func_ranges = read_func_ranges(wasm_input) code_section_offset = get_code_section_offset(wasm) diff --git a/tools/webassembly.py b/tools/webassembly.py index 0a9fdf61a975b..6c864eb8b2303 100644 --- a/tools/webassembly.py +++ b/tools/webassembly.py @@ -522,6 +522,34 @@ def get_function_types(self): def has_name_section(self): return self.get_custom_section('name') is not None + @memoize + def get_function_names(self): + num_funcs = self.num_imported_funcs() + len(self.get_functions()) + names = [None] * num_funcs + + name_section = self.get_custom_section('name') + if not name_section: + return names + + self.seek(name_section.offset) + self.read_string() # section name + section_end = name_section.offset + name_section.size + + while self.tell() < section_end: + subsection_id = self.read_byte() + subsection_size = self.read_uleb() + if subsection_id == 1: # function names + count = self.read_uleb() + for _ in range(count): + func_idx = self.read_uleb() + func_name = self.read_string() + assert func_idx < len(names) + names[func_idx] = func_name + else: + self.skip(subsection_size) + + return names + @once def _calc_indexes(self): self.imports_by_kind = {}