From a203fdb1b62ce2df855421d46905f8b821eeabbd Mon Sep 17 00:00:00 2001 From: Justin Ridgewell Date: Sun, 4 Dec 2022 00:52:00 -0500 Subject: [PATCH] fix(common): Fix `BytePos` -> `CharPos` calculations (#6574) **Description:** This fixes the BytePos -> CharPos calculation necessary for source maps. There were a few issues in the old code: 1. UTF-8 maps 1-3 bytes into 1 UTF-16 char, but 4 bytes into 2 UTF-16 chars 2. The starting offset was not recorded when we reached the end of the `multibyte_chars` iteration 3. The `mappings` can be unordered, meaning we need to restart the UTF-16 offset calculation **Related issue:** - Closes https://github.com/swc-project/swc/issues/6552. --- .../issue-6552/input-map/input/.swcrc | 20 ++ .../issue-6552/input-map/input/index.js | 4 + .../issue-6552/input-map/output/index.js | 1 + .../issue-6552/input-map/output/index.map | 17 ++ .../sourcemap/issue-6552/no-map/input/.swcrc | 20 ++ .../issue-6552/no-map/input/index.js | 4 + .../issue-6552/no-map/output/index.js | 1 + .../issue-6552/no-map/output/index.map | 17 ++ crates/swc_common/src/source_map.rs | 189 +++++++++++++----- crates/swc_common/src/syntax_pos.rs | 19 +- crates/swc_estree_compat/src/babelify/mod.rs | 16 +- 11 files changed, 245 insertions(+), 63 deletions(-) create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/.swcrc create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/index.js create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.js create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.map create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/.swcrc create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/index.js create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.js create mode 100644 crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.map diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/.swcrc b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/.swcrc new file mode 100644 index 00000000000..84bd4c6fcaa --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/.swcrc @@ -0,0 +1,20 @@ +{ + "sourceMaps": true, + "jsc": { + "parser": { + "syntax": "ecmascript", + "jsx": false + }, + "target": "es5", + "loose": false, + "minify": { + "compress": false, + "mangle": false + } + }, + "module": { + "type": "commonjs" + }, + "minify": true, + "isModule": true +} diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/index.js b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/index.js new file mode 100644 index 00000000000..5efd35bfca9 --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/index.js @@ -0,0 +1,4 @@ +const xxx = ', something'; +console.error(`❌ ${message}`); +const bbb = ''; +//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ== diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.js b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.js new file mode 100644 index 00000000000..06857c462ea --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.js @@ -0,0 +1 @@ +"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb=""; diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.map b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.map new file mode 100644 index 00000000000..ceabf60c914 --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/input-map/output/index.map @@ -0,0 +1,17 @@ +{ + "mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UACnB,IAAMC,IAAM", + "names": [ + "xxx", + "console", + "error", + "message", + "bbb" + ], + "sources": [ + "../../input/index.js" + ], + "sourcesContent": [ + "const xxx = ', something';\nconsole.error(`❌ ${message}`);\nconst bbb = '';\n//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ==\n" + ], + "version": 3 +} diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/.swcrc b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/.swcrc new file mode 100644 index 00000000000..84bd4c6fcaa --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/.swcrc @@ -0,0 +1,20 @@ +{ + "sourceMaps": true, + "jsc": { + "parser": { + "syntax": "ecmascript", + "jsx": false + }, + "target": "es5", + "loose": false, + "minify": { + "compress": false, + "mangle": false + } + }, + "module": { + "type": "commonjs" + }, + "minify": true, + "isModule": true +} diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/index.js b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/index.js new file mode 100644 index 00000000000..d3eeaa6cd51 --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/input/index.js @@ -0,0 +1,4 @@ +const xxx = ', something' +console.error(`❌ ${message}`); + +const bbb = '' diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.js b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.js new file mode 100644 index 00000000000..06857c462ea --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.js @@ -0,0 +1 @@ +"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb=""; diff --git a/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.map b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.map new file mode 100644 index 00000000000..7f2baf65599 --- /dev/null +++ b/crates/swc/tests/fixture/sourcemap/issue-6552/no-map/output/index.map @@ -0,0 +1,17 @@ +{ + "mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UAEnB,IAAMC,IAAM", + "names": [ + "xxx", + "console", + "error", + "message", + "bbb" + ], + "sources": [ + "../../input/index.js" + ], + "sourcesContent": [ + "const xxx = ', something'\nconsole.error(`❌ ${message}`);\n\nconst bbb = ''\n" + ], + "version": 3 +} diff --git a/crates/swc_common/src/source_map.rs b/crates/swc_common/src/source_map.rs index 7fc14f8bb67..8029b6c57c6 100644 --- a/crates/swc_common/src/source_map.rs +++ b/crates/swc_common/src/source_map.rs @@ -17,9 +17,7 @@ //! within the SourceMap, which upon request can be converted to line and column //! information, source code snippets, etc. use std::{ - cmp, - cmp::{max, min}, - env, fs, + cmp, env, fs, hash::Hash, io, path::{Path, PathBuf}, @@ -295,8 +293,7 @@ impl SourceMap { ); let linechpos = self.bytepos_to_file_charpos_with(&f, linebpos); - - let col = max(chpos, linechpos) - min(chpos, linechpos); + let col = chpos - linechpos; let col_display = { let start_width_idx = f @@ -954,7 +951,7 @@ impl SourceMap { } fn bytepos_to_file_charpos_with(&self, map: &SourceFile, bpos: BytePos) -> CharPos { - let total_extra_bytes = self.calc_extra_bytes(map, &mut 0, &mut 0, bpos); + let total_extra_bytes = self.calc_utf16_offset(map, bpos, &mut Default::default()); assert!( map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32(), "map.start_pos = {:?}; total_extra_bytes = {}; bpos = {:?}", @@ -965,23 +962,43 @@ impl SourceMap { CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize) } - /// Converts an absolute BytePos to a CharPos relative to the source_file. - fn calc_extra_bytes( - &self, - map: &SourceFile, - prev_total_extra_bytes: &mut u32, - start: &mut usize, - bpos: BytePos, - ) -> u32 { - // The number of extra bytes due to multibyte chars in the SourceFile - let mut total_extra_bytes = *prev_total_extra_bytes; + /// Converts a span of absolute BytePos to a CharPos relative to the + /// source_file. + pub fn span_to_char_offset(&self, file: &SourceFile, span: Span) -> (u32, u32) { + // We rename this to feel more comfortable while doing math. + let start_offset = file.start_pos; - for (i, &mbc) in map.multibyte_chars[*start..].iter().enumerate() { - debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos); - if mbc.pos < bpos { - // every character is at least one byte, so we only - // count the actual extra bytes. - total_extra_bytes += mbc.bytes as u32 - 1; + let mut state = ByteToCharPosState::default(); + let start = span.lo.to_u32() + - start_offset.to_u32() + - self.calc_utf16_offset(file, span.lo, &mut state); + let end = span.hi.to_u32() + - start_offset.to_u32() + - self.calc_utf16_offset(file, span.hi, &mut state); + + (start, end) + } + + /// Calculates the number of excess chars seen in the UTF-8 encoding of a + /// file compared with the UTF-16 encoding. + fn calc_utf16_offset( + &self, + file: &SourceFile, + bpos: BytePos, + state: &mut ByteToCharPosState, + ) -> u32 { + let mut total_extra_bytes = state.total_extra_bytes; + let mut index = state.mbc_index; + + if bpos >= state.pos { + let range = index..file.multibyte_chars.len(); + for i in range { + let mbc = &file.multibyte_chars[i]; + debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos); + if mbc.pos >= bpos { + break; + } + total_extra_bytes += mbc.byte_to_char_diff() as u32; // We should never see a byte position in the middle of a // character debug_assert!( @@ -991,13 +1008,32 @@ impl SourceMap { mbc.pos, mbc.bytes ); - } else { - *start += i; - break; + index += 1; + } + } else { + let range = 0..index; + for i in range.rev() { + let mbc = &file.multibyte_chars[i]; + debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos); + if mbc.pos < bpos { + break; + } + total_extra_bytes -= mbc.byte_to_char_diff() as u32; + // We should never see a byte position in the middle of a + // character + debug_assert!( + bpos.to_u32() <= mbc.pos.to_u32(), + "bpos = {:?}, mbc.pos = {:?}", + bpos, + mbc.pos, + ); + index -= 1; } } - *prev_total_extra_bytes = total_extra_bytes; + state.pos = bpos; + state.total_extra_bytes = total_extra_bytes; + state.mbc_index = index; total_extra_bytes } @@ -1191,11 +1227,9 @@ impl SourceMap { let mut prev_dst_line = u32::MAX; - let mut prev_extra_bytes = 0; - let mut ch_start = 0; - let mut line_prev_extra_bytes = 0; - let mut line_ch_start = 0; let mut inline_sources_content = false; + let mut ch_state = ByteToCharPosState::default(); + let mut line_state = ByteToCharPosState::default(); for (pos, lc) in mappings.iter() { let pos = *pos; @@ -1229,11 +1263,8 @@ impl SourceMap { builder.set_source_contents(src_id, Some(&f.src)); } - prev_extra_bytes = 0; - ch_start = 0; - - line_prev_extra_bytes = 0; - line_ch_start = 0; + ch_state = ByteToCharPosState::default(); + line_state = ByteToCharPosState::default(); cur_file = Some(f.clone()); &f @@ -1253,7 +1284,6 @@ impl SourceMap { Some(line) => line as u32, None => continue, }; - let mut name = config.name_for_bytepos(pos); let linebpos = f.lines[line as usize]; debug_assert!( @@ -1263,18 +1293,21 @@ impl SourceMap { pos, linebpos, ); - let chpos = - pos.to_u32() - self.calc_extra_bytes(f, &mut prev_extra_bytes, &mut ch_start, pos); - let linechpos = linebpos.to_u32() - - self.calc_extra_bytes( - f, - &mut line_prev_extra_bytes, - &mut line_ch_start, - linebpos, - ); - let mut col = max(chpos, linechpos) - min(chpos, linechpos); + let linechpos = + linebpos.to_u32() - self.calc_utf16_offset(f, linebpos, &mut line_state); + let chpos = pos.to_u32() - self.calc_utf16_offset(f, pos, &mut ch_state); + debug_assert!( + chpos >= linechpos, + "{}: chpos = {:?}; linechpos = {:?};", + f.name, + chpos, + linechpos, + ); + + let mut col = chpos - linechpos; + let mut name = None; if let Some(orig) = &orig { if let Some(token) = orig .lookup_token(line, col) @@ -1298,7 +1331,9 @@ impl SourceMap { } } - let name_idx = name.map(|name| builder.add_name(name)); + let name_idx = name + .or_else(|| config.name_for_bytepos(pos)) + .map(|name| builder.add_name(name)); builder.add_raw(lc.line, lc.col, line, col, Some(src_id), name_idx); prev_dst_line = lc.line; @@ -1434,6 +1469,20 @@ impl SourceMapGenConfig for DefaultSourceMapGenConfig { } } +/// Stores the state of the last conversion between BytePos and CharPos. +#[derive(Debug, Clone, Default)] +pub struct ByteToCharPosState { + /// The last BytePos to convert. + pos: BytePos, + + /// The total number of extra chars in the UTF-8 encoding. + total_extra_bytes: u32, + + /// The index of the last MultiByteChar read to compute the extra bytes of + /// the last conversion. + mbc_index: usize, +} + // _____________________________________________________________________________ // Tests // @@ -1653,6 +1702,52 @@ mod tests { assert!(sm.merge_spans(span1, span2).is_none()); } + #[test] + fn calc_utf16_offset() { + let input = "t¢e∆s💩t"; + let sm = SourceMap::new(FilePathMapping::empty()); + let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string()); + + let mut state = ByteToCharPosState::default(); + let mut bpos = file.start_pos; + let mut cpos = CharPos(bpos.to_usize()); + for c in input.chars() { + let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state); + + assert_eq!(actual, cpos.to_u32()); + + bpos = bpos + BytePos(c.len_utf8() as u32); + cpos = cpos + CharPos(c.len_utf16()); + } + + for c in input.chars().rev() { + bpos = bpos - BytePos(c.len_utf8() as u32); + cpos = cpos - CharPos(c.len_utf16()); + + let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state); + + assert_eq!(actual, cpos.to_u32()); + } + } + + #[test] + fn bytepos_to_charpos() { + let input = "t¢e∆s💩t"; + let sm = SourceMap::new(FilePathMapping::empty()); + let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string()); + + let mut bpos = file.start_pos; + let mut cpos = CharPos(0); + for c in input.chars() { + let actual = sm.bytepos_to_file_charpos_with(&file, bpos); + + assert_eq!(actual, cpos); + + bpos = bpos + BytePos(c.len_utf8() as u32); + cpos = cpos + CharPos(c.len_utf16()); + } + } + /// Returns the span corresponding to the `n`th occurrence of /// `substring` in `source_text`. trait SourceMapExtension { diff --git a/crates/swc_common/src/syntax_pos.rs b/crates/swc_common/src/syntax_pos.rs index a9bb8bbb6fe..97b6fed1c59 100644 --- a/crates/swc_common/src/syntax_pos.rs +++ b/crates/swc_common/src/syntax_pos.rs @@ -737,6 +737,21 @@ pub struct MultiByteChar { pub bytes: u8, } +impl MultiByteChar { + /// Computes the extra number of UTF-8 bytes necessary to encode a code + /// point, compared to UTF-16 encoding. + /// + /// 1, 2, and 3 UTF-8 bytes encode into 1 UTF-16 char, but 4 UTF-8 bytes + /// encode into 2. + pub fn byte_to_char_diff(&self) -> u8 { + if self.bytes == 4 { + 2 + } else { + self.bytes - 1 + } + } +} + /// Identifies an offset of a non-narrow character in a SourceFile #[cfg_attr( any(feature = "rkyv-impl", feature = "rkyv-bytecheck-impl"), @@ -1002,7 +1017,9 @@ pub trait Pos { /// - Values larger than `u32::MAX - 2^16` are reserved for the comments. /// /// `u32::MAX` is special value used to generate source map entries. -#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize)] +#[derive( + Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize, Default, +)] #[serde(transparent)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[cfg_attr( diff --git a/crates/swc_estree_compat/src/babelify/mod.rs b/crates/swc_estree_compat/src/babelify/mod.rs index cb9cd18f381..c5713970f60 100644 --- a/crates/swc_estree_compat/src/babelify/mod.rs +++ b/crates/swc_estree_compat/src/babelify/mod.rs @@ -40,21 +40,7 @@ impl Context { return (None, None); } - // We rename this to feel more comfortable while doing math. - let start_offset = self.fm.start_pos; - - let mut start = span.lo.0 - start_offset.0; - let mut end = span.hi.0 - start_offset.0; - - for mb in self.fm.multibyte_chars.iter() { - if mb.pos < span.lo { - start -= (mb.bytes - 1) as u32; - } - - if mb.pos < span.hi { - end -= (mb.bytes - 1) as u32; - } - } + let (start, end) = self.cm.span_to_char_offset(&self.fm, span); (Some(start), Some(end)) }