fix(common): Fix BytePos -> CharPos calculations (#6574)

**Description:**

This fixes the BytePos -> CharPos calculation necessary for source maps.
There were a few issues in the old code:

1. UTF-8 maps 1-3 bytes into 1 UTF-16 char, but 4 bytes into 2 UTF-16 chars
2. The starting offset was not recorded when we reached the end of the `multibyte_chars` iteration
3. The `mappings` can be unordered, meaning we need to restart the UTF-16 offset calculation

**Related issue:**

 - Closes https://github.com/swc-project/swc/issues/6552.
This commit is contained in:
Justin Ridgewell 2022-12-04 00:52:00 -05:00 committed by GitHub
parent 8bee06f78f
commit a203fdb1b6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 245 additions and 63 deletions

View File

@ -0,0 +1,20 @@
{
"sourceMaps": true,
"jsc": {
"parser": {
"syntax": "ecmascript",
"jsx": false
},
"target": "es5",
"loose": false,
"minify": {
"compress": false,
"mangle": false
}
},
"module": {
"type": "commonjs"
},
"minify": true,
"isModule": true
}

View File

@ -0,0 +1,4 @@
const xxx = ', something';
console.error(`${message}`);
const bbb = '';
//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ==

View File

@ -0,0 +1 @@
"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb="";

View File

@ -0,0 +1,17 @@
{
"mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UACnB,IAAMC,IAAM",
"names": [
"xxx",
"console",
"error",
"message",
"bbb"
],
"sources": [
"../../input/index.js"
],
"sourcesContent": [
"const xxx = ', something';\nconsole.error(`❌ ${message}`);\nconst bbb = '';\n//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ==\n"
],
"version": 3
}

View File

@ -0,0 +1,20 @@
{
"sourceMaps": true,
"jsc": {
"parser": {
"syntax": "ecmascript",
"jsx": false
},
"target": "es5",
"loose": false,
"minify": {
"compress": false,
"mangle": false
}
},
"module": {
"type": "commonjs"
},
"minify": true,
"isModule": true
}

View File

@ -0,0 +1,4 @@
const xxx = ', something'
console.error(`${message}`);
const bbb = ''

View File

@ -0,0 +1 @@
"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb="";

View File

@ -0,0 +1,17 @@
{
"mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UAEnB,IAAMC,IAAM",
"names": [
"xxx",
"console",
"error",
"message",
"bbb"
],
"sources": [
"../../input/index.js"
],
"sourcesContent": [
"const xxx = ', something'\nconsole.error(`❌ ${message}`);\n\nconst bbb = ''\n"
],
"version": 3
}

View File

@ -17,9 +17,7 @@
//! within the SourceMap, which upon request can be converted to line and column
//! information, source code snippets, etc.
use std::{
cmp,
cmp::{max, min},
env, fs,
cmp, env, fs,
hash::Hash,
io,
path::{Path, PathBuf},
@ -295,8 +293,7 @@ impl SourceMap {
);
let linechpos = self.bytepos_to_file_charpos_with(&f, linebpos);
let col = max(chpos, linechpos) - min(chpos, linechpos);
let col = chpos - linechpos;
let col_display = {
let start_width_idx = f
@ -954,7 +951,7 @@ impl SourceMap {
}
fn bytepos_to_file_charpos_with(&self, map: &SourceFile, bpos: BytePos) -> CharPos {
let total_extra_bytes = self.calc_extra_bytes(map, &mut 0, &mut 0, bpos);
let total_extra_bytes = self.calc_utf16_offset(map, bpos, &mut Default::default());
assert!(
map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32(),
"map.start_pos = {:?}; total_extra_bytes = {}; bpos = {:?}",
@ -965,23 +962,43 @@ impl SourceMap {
CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize)
}
/// Converts an absolute BytePos to a CharPos relative to the source_file.
fn calc_extra_bytes(
&self,
map: &SourceFile,
prev_total_extra_bytes: &mut u32,
start: &mut usize,
bpos: BytePos,
) -> u32 {
// The number of extra bytes due to multibyte chars in the SourceFile
let mut total_extra_bytes = *prev_total_extra_bytes;
/// Converts a span of absolute BytePos to a CharPos relative to the
/// source_file.
pub fn span_to_char_offset(&self, file: &SourceFile, span: Span) -> (u32, u32) {
// We rename this to feel more comfortable while doing math.
let start_offset = file.start_pos;
for (i, &mbc) in map.multibyte_chars[*start..].iter().enumerate() {
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
if mbc.pos < bpos {
// every character is at least one byte, so we only
// count the actual extra bytes.
total_extra_bytes += mbc.bytes as u32 - 1;
let mut state = ByteToCharPosState::default();
let start = span.lo.to_u32()
- start_offset.to_u32()
- self.calc_utf16_offset(file, span.lo, &mut state);
let end = span.hi.to_u32()
- start_offset.to_u32()
- self.calc_utf16_offset(file, span.hi, &mut state);
(start, end)
}
/// Calculates the number of excess chars seen in the UTF-8 encoding of a
/// file compared with the UTF-16 encoding.
fn calc_utf16_offset(
&self,
file: &SourceFile,
bpos: BytePos,
state: &mut ByteToCharPosState,
) -> u32 {
let mut total_extra_bytes = state.total_extra_bytes;
let mut index = state.mbc_index;
if bpos >= state.pos {
let range = index..file.multibyte_chars.len();
for i in range {
let mbc = &file.multibyte_chars[i];
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
if mbc.pos >= bpos {
break;
}
total_extra_bytes += mbc.byte_to_char_diff() as u32;
// We should never see a byte position in the middle of a
// character
debug_assert!(
@ -991,13 +1008,32 @@ impl SourceMap {
mbc.pos,
mbc.bytes
);
} else {
*start += i;
break;
index += 1;
}
} else {
let range = 0..index;
for i in range.rev() {
let mbc = &file.multibyte_chars[i];
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
if mbc.pos < bpos {
break;
}
total_extra_bytes -= mbc.byte_to_char_diff() as u32;
// We should never see a byte position in the middle of a
// character
debug_assert!(
bpos.to_u32() <= mbc.pos.to_u32(),
"bpos = {:?}, mbc.pos = {:?}",
bpos,
mbc.pos,
);
index -= 1;
}
}
*prev_total_extra_bytes = total_extra_bytes;
state.pos = bpos;
state.total_extra_bytes = total_extra_bytes;
state.mbc_index = index;
total_extra_bytes
}
@ -1191,11 +1227,9 @@ impl SourceMap {
let mut prev_dst_line = u32::MAX;
let mut prev_extra_bytes = 0;
let mut ch_start = 0;
let mut line_prev_extra_bytes = 0;
let mut line_ch_start = 0;
let mut inline_sources_content = false;
let mut ch_state = ByteToCharPosState::default();
let mut line_state = ByteToCharPosState::default();
for (pos, lc) in mappings.iter() {
let pos = *pos;
@ -1229,11 +1263,8 @@ impl SourceMap {
builder.set_source_contents(src_id, Some(&f.src));
}
prev_extra_bytes = 0;
ch_start = 0;
line_prev_extra_bytes = 0;
line_ch_start = 0;
ch_state = ByteToCharPosState::default();
line_state = ByteToCharPosState::default();
cur_file = Some(f.clone());
&f
@ -1253,7 +1284,6 @@ impl SourceMap {
Some(line) => line as u32,
None => continue,
};
let mut name = config.name_for_bytepos(pos);
let linebpos = f.lines[line as usize];
debug_assert!(
@ -1263,18 +1293,21 @@ impl SourceMap {
pos,
linebpos,
);
let chpos =
pos.to_u32() - self.calc_extra_bytes(f, &mut prev_extra_bytes, &mut ch_start, pos);
let linechpos = linebpos.to_u32()
- self.calc_extra_bytes(
f,
&mut line_prev_extra_bytes,
&mut line_ch_start,
linebpos,
);
let mut col = max(chpos, linechpos) - min(chpos, linechpos);
let linechpos =
linebpos.to_u32() - self.calc_utf16_offset(f, linebpos, &mut line_state);
let chpos = pos.to_u32() - self.calc_utf16_offset(f, pos, &mut ch_state);
debug_assert!(
chpos >= linechpos,
"{}: chpos = {:?}; linechpos = {:?};",
f.name,
chpos,
linechpos,
);
let mut col = chpos - linechpos;
let mut name = None;
if let Some(orig) = &orig {
if let Some(token) = orig
.lookup_token(line, col)
@ -1298,7 +1331,9 @@ impl SourceMap {
}
}
let name_idx = name.map(|name| builder.add_name(name));
let name_idx = name
.or_else(|| config.name_for_bytepos(pos))
.map(|name| builder.add_name(name));
builder.add_raw(lc.line, lc.col, line, col, Some(src_id), name_idx);
prev_dst_line = lc.line;
@ -1434,6 +1469,20 @@ impl SourceMapGenConfig for DefaultSourceMapGenConfig {
}
}
/// Stores the state of the last conversion between BytePos and CharPos.
#[derive(Debug, Clone, Default)]
pub struct ByteToCharPosState {
/// The last BytePos to convert.
pos: BytePos,
/// The total number of extra chars in the UTF-8 encoding.
total_extra_bytes: u32,
/// The index of the last MultiByteChar read to compute the extra bytes of
/// the last conversion.
mbc_index: usize,
}
// _____________________________________________________________________________
// Tests
//
@ -1653,6 +1702,52 @@ mod tests {
assert!(sm.merge_spans(span1, span2).is_none());
}
#[test]
fn calc_utf16_offset() {
let input = "t¢e∆s💩t";
let sm = SourceMap::new(FilePathMapping::empty());
let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string());
let mut state = ByteToCharPosState::default();
let mut bpos = file.start_pos;
let mut cpos = CharPos(bpos.to_usize());
for c in input.chars() {
let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state);
assert_eq!(actual, cpos.to_u32());
bpos = bpos + BytePos(c.len_utf8() as u32);
cpos = cpos + CharPos(c.len_utf16());
}
for c in input.chars().rev() {
bpos = bpos - BytePos(c.len_utf8() as u32);
cpos = cpos - CharPos(c.len_utf16());
let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state);
assert_eq!(actual, cpos.to_u32());
}
}
#[test]
fn bytepos_to_charpos() {
let input = "t¢e∆s💩t";
let sm = SourceMap::new(FilePathMapping::empty());
let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string());
let mut bpos = file.start_pos;
let mut cpos = CharPos(0);
for c in input.chars() {
let actual = sm.bytepos_to_file_charpos_with(&file, bpos);
assert_eq!(actual, cpos);
bpos = bpos + BytePos(c.len_utf8() as u32);
cpos = cpos + CharPos(c.len_utf16());
}
}
/// Returns the span corresponding to the `n`th occurrence of
/// `substring` in `source_text`.
trait SourceMapExtension {

View File

@ -737,6 +737,21 @@ pub struct MultiByteChar {
pub bytes: u8,
}
impl MultiByteChar {
/// Computes the extra number of UTF-8 bytes necessary to encode a code
/// point, compared to UTF-16 encoding.
///
/// 1, 2, and 3 UTF-8 bytes encode into 1 UTF-16 char, but 4 UTF-8 bytes
/// encode into 2.
pub fn byte_to_char_diff(&self) -> u8 {
if self.bytes == 4 {
2
} else {
self.bytes - 1
}
}
}
/// Identifies an offset of a non-narrow character in a SourceFile
#[cfg_attr(
any(feature = "rkyv-impl", feature = "rkyv-bytecheck-impl"),
@ -1002,7 +1017,9 @@ pub trait Pos {
/// - Values larger than `u32::MAX - 2^16` are reserved for the comments.
///
/// `u32::MAX` is special value used to generate source map entries.
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize)]
#[derive(
Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize, Default,
)]
#[serde(transparent)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(

View File

@ -40,21 +40,7 @@ impl Context {
return (None, None);
}
// We rename this to feel more comfortable while doing math.
let start_offset = self.fm.start_pos;
let mut start = span.lo.0 - start_offset.0;
let mut end = span.hi.0 - start_offset.0;
for mb in self.fm.multibyte_chars.iter() {
if mb.pos < span.lo {
start -= (mb.bytes - 1) as u32;
}
if mb.pos < span.hi {
end -= (mb.bytes - 1) as u32;
}
}
let (start, end) = self.cm.span_to_char_offset(&self.fm, span);
(Some(start), Some(end))
}