mirror of
https://github.com/swc-project/swc.git
synced 2024-11-24 10:12:42 +03:00
fix(common): Fix BytePos
-> CharPos
calculations (#6574)
**Description:** This fixes the BytePos -> CharPos calculation necessary for source maps. There were a few issues in the old code: 1. UTF-8 maps 1-3 bytes into 1 UTF-16 char, but 4 bytes into 2 UTF-16 chars 2. The starting offset was not recorded when we reached the end of the `multibyte_chars` iteration 3. The `mappings` can be unordered, meaning we need to restart the UTF-16 offset calculation **Related issue:** - Closes https://github.com/swc-project/swc/issues/6552.
This commit is contained in:
parent
8bee06f78f
commit
a203fdb1b6
@ -0,0 +1,20 @@
|
||||
{
|
||||
"sourceMaps": true,
|
||||
"jsc": {
|
||||
"parser": {
|
||||
"syntax": "ecmascript",
|
||||
"jsx": false
|
||||
},
|
||||
"target": "es5",
|
||||
"loose": false,
|
||||
"minify": {
|
||||
"compress": false,
|
||||
"mangle": false
|
||||
}
|
||||
},
|
||||
"module": {
|
||||
"type": "commonjs"
|
||||
},
|
||||
"minify": true,
|
||||
"isModule": true
|
||||
}
|
@ -0,0 +1,4 @@
|
||||
const xxx = ', something';
|
||||
console.error(`❌ ${message}`);
|
||||
const bbb = '';
|
||||
//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ==
|
@ -0,0 +1 @@
|
||||
"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb="";
|
@ -0,0 +1,17 @@
|
||||
{
|
||||
"mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UACnB,IAAMC,IAAM",
|
||||
"names": [
|
||||
"xxx",
|
||||
"console",
|
||||
"error",
|
||||
"message",
|
||||
"bbb"
|
||||
],
|
||||
"sources": [
|
||||
"../../input/index.js"
|
||||
],
|
||||
"sourcesContent": [
|
||||
"const xxx = ', something';\nconsole.error(`❌ ${message}`);\nconst bbb = '';\n//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ==\n"
|
||||
],
|
||||
"version": 3
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
{
|
||||
"sourceMaps": true,
|
||||
"jsc": {
|
||||
"parser": {
|
||||
"syntax": "ecmascript",
|
||||
"jsx": false
|
||||
},
|
||||
"target": "es5",
|
||||
"loose": false,
|
||||
"minify": {
|
||||
"compress": false,
|
||||
"mangle": false
|
||||
}
|
||||
},
|
||||
"module": {
|
||||
"type": "commonjs"
|
||||
},
|
||||
"minify": true,
|
||||
"isModule": true
|
||||
}
|
@ -0,0 +1,4 @@
|
||||
const xxx = ', something'
|
||||
console.error(`❌ ${message}`);
|
||||
|
||||
const bbb = ''
|
@ -0,0 +1 @@
|
||||
"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb="";
|
@ -0,0 +1,17 @@
|
||||
{
|
||||
"mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UAEnB,IAAMC,IAAM",
|
||||
"names": [
|
||||
"xxx",
|
||||
"console",
|
||||
"error",
|
||||
"message",
|
||||
"bbb"
|
||||
],
|
||||
"sources": [
|
||||
"../../input/index.js"
|
||||
],
|
||||
"sourcesContent": [
|
||||
"const xxx = ', something'\nconsole.error(`❌ ${message}`);\n\nconst bbb = ''\n"
|
||||
],
|
||||
"version": 3
|
||||
}
|
@ -17,9 +17,7 @@
|
||||
//! within the SourceMap, which upon request can be converted to line and column
|
||||
//! information, source code snippets, etc.
|
||||
use std::{
|
||||
cmp,
|
||||
cmp::{max, min},
|
||||
env, fs,
|
||||
cmp, env, fs,
|
||||
hash::Hash,
|
||||
io,
|
||||
path::{Path, PathBuf},
|
||||
@ -295,8 +293,7 @@ impl SourceMap {
|
||||
);
|
||||
|
||||
let linechpos = self.bytepos_to_file_charpos_with(&f, linebpos);
|
||||
|
||||
let col = max(chpos, linechpos) - min(chpos, linechpos);
|
||||
let col = chpos - linechpos;
|
||||
|
||||
let col_display = {
|
||||
let start_width_idx = f
|
||||
@ -954,7 +951,7 @@ impl SourceMap {
|
||||
}
|
||||
|
||||
fn bytepos_to_file_charpos_with(&self, map: &SourceFile, bpos: BytePos) -> CharPos {
|
||||
let total_extra_bytes = self.calc_extra_bytes(map, &mut 0, &mut 0, bpos);
|
||||
let total_extra_bytes = self.calc_utf16_offset(map, bpos, &mut Default::default());
|
||||
assert!(
|
||||
map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32(),
|
||||
"map.start_pos = {:?}; total_extra_bytes = {}; bpos = {:?}",
|
||||
@ -965,23 +962,43 @@ impl SourceMap {
|
||||
CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize)
|
||||
}
|
||||
|
||||
/// Converts an absolute BytePos to a CharPos relative to the source_file.
|
||||
fn calc_extra_bytes(
|
||||
&self,
|
||||
map: &SourceFile,
|
||||
prev_total_extra_bytes: &mut u32,
|
||||
start: &mut usize,
|
||||
bpos: BytePos,
|
||||
) -> u32 {
|
||||
// The number of extra bytes due to multibyte chars in the SourceFile
|
||||
let mut total_extra_bytes = *prev_total_extra_bytes;
|
||||
/// Converts a span of absolute BytePos to a CharPos relative to the
|
||||
/// source_file.
|
||||
pub fn span_to_char_offset(&self, file: &SourceFile, span: Span) -> (u32, u32) {
|
||||
// We rename this to feel more comfortable while doing math.
|
||||
let start_offset = file.start_pos;
|
||||
|
||||
for (i, &mbc) in map.multibyte_chars[*start..].iter().enumerate() {
|
||||
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
|
||||
if mbc.pos < bpos {
|
||||
// every character is at least one byte, so we only
|
||||
// count the actual extra bytes.
|
||||
total_extra_bytes += mbc.bytes as u32 - 1;
|
||||
let mut state = ByteToCharPosState::default();
|
||||
let start = span.lo.to_u32()
|
||||
- start_offset.to_u32()
|
||||
- self.calc_utf16_offset(file, span.lo, &mut state);
|
||||
let end = span.hi.to_u32()
|
||||
- start_offset.to_u32()
|
||||
- self.calc_utf16_offset(file, span.hi, &mut state);
|
||||
|
||||
(start, end)
|
||||
}
|
||||
|
||||
/// Calculates the number of excess chars seen in the UTF-8 encoding of a
|
||||
/// file compared with the UTF-16 encoding.
|
||||
fn calc_utf16_offset(
|
||||
&self,
|
||||
file: &SourceFile,
|
||||
bpos: BytePos,
|
||||
state: &mut ByteToCharPosState,
|
||||
) -> u32 {
|
||||
let mut total_extra_bytes = state.total_extra_bytes;
|
||||
let mut index = state.mbc_index;
|
||||
|
||||
if bpos >= state.pos {
|
||||
let range = index..file.multibyte_chars.len();
|
||||
for i in range {
|
||||
let mbc = &file.multibyte_chars[i];
|
||||
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
|
||||
if mbc.pos >= bpos {
|
||||
break;
|
||||
}
|
||||
total_extra_bytes += mbc.byte_to_char_diff() as u32;
|
||||
// We should never see a byte position in the middle of a
|
||||
// character
|
||||
debug_assert!(
|
||||
@ -991,13 +1008,32 @@ impl SourceMap {
|
||||
mbc.pos,
|
||||
mbc.bytes
|
||||
);
|
||||
} else {
|
||||
*start += i;
|
||||
break;
|
||||
index += 1;
|
||||
}
|
||||
} else {
|
||||
let range = 0..index;
|
||||
for i in range.rev() {
|
||||
let mbc = &file.multibyte_chars[i];
|
||||
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
|
||||
if mbc.pos < bpos {
|
||||
break;
|
||||
}
|
||||
total_extra_bytes -= mbc.byte_to_char_diff() as u32;
|
||||
// We should never see a byte position in the middle of a
|
||||
// character
|
||||
debug_assert!(
|
||||
bpos.to_u32() <= mbc.pos.to_u32(),
|
||||
"bpos = {:?}, mbc.pos = {:?}",
|
||||
bpos,
|
||||
mbc.pos,
|
||||
);
|
||||
index -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
*prev_total_extra_bytes = total_extra_bytes;
|
||||
state.pos = bpos;
|
||||
state.total_extra_bytes = total_extra_bytes;
|
||||
state.mbc_index = index;
|
||||
|
||||
total_extra_bytes
|
||||
}
|
||||
@ -1191,11 +1227,9 @@ impl SourceMap {
|
||||
|
||||
let mut prev_dst_line = u32::MAX;
|
||||
|
||||
let mut prev_extra_bytes = 0;
|
||||
let mut ch_start = 0;
|
||||
let mut line_prev_extra_bytes = 0;
|
||||
let mut line_ch_start = 0;
|
||||
let mut inline_sources_content = false;
|
||||
let mut ch_state = ByteToCharPosState::default();
|
||||
let mut line_state = ByteToCharPosState::default();
|
||||
|
||||
for (pos, lc) in mappings.iter() {
|
||||
let pos = *pos;
|
||||
@ -1229,11 +1263,8 @@ impl SourceMap {
|
||||
builder.set_source_contents(src_id, Some(&f.src));
|
||||
}
|
||||
|
||||
prev_extra_bytes = 0;
|
||||
ch_start = 0;
|
||||
|
||||
line_prev_extra_bytes = 0;
|
||||
line_ch_start = 0;
|
||||
ch_state = ByteToCharPosState::default();
|
||||
line_state = ByteToCharPosState::default();
|
||||
|
||||
cur_file = Some(f.clone());
|
||||
&f
|
||||
@ -1253,7 +1284,6 @@ impl SourceMap {
|
||||
Some(line) => line as u32,
|
||||
None => continue,
|
||||
};
|
||||
let mut name = config.name_for_bytepos(pos);
|
||||
|
||||
let linebpos = f.lines[line as usize];
|
||||
debug_assert!(
|
||||
@ -1263,18 +1293,21 @@ impl SourceMap {
|
||||
pos,
|
||||
linebpos,
|
||||
);
|
||||
let chpos =
|
||||
pos.to_u32() - self.calc_extra_bytes(f, &mut prev_extra_bytes, &mut ch_start, pos);
|
||||
let linechpos = linebpos.to_u32()
|
||||
- self.calc_extra_bytes(
|
||||
f,
|
||||
&mut line_prev_extra_bytes,
|
||||
&mut line_ch_start,
|
||||
linebpos,
|
||||
);
|
||||
|
||||
let mut col = max(chpos, linechpos) - min(chpos, linechpos);
|
||||
let linechpos =
|
||||
linebpos.to_u32() - self.calc_utf16_offset(f, linebpos, &mut line_state);
|
||||
let chpos = pos.to_u32() - self.calc_utf16_offset(f, pos, &mut ch_state);
|
||||
|
||||
debug_assert!(
|
||||
chpos >= linechpos,
|
||||
"{}: chpos = {:?}; linechpos = {:?};",
|
||||
f.name,
|
||||
chpos,
|
||||
linechpos,
|
||||
);
|
||||
|
||||
let mut col = chpos - linechpos;
|
||||
let mut name = None;
|
||||
if let Some(orig) = &orig {
|
||||
if let Some(token) = orig
|
||||
.lookup_token(line, col)
|
||||
@ -1298,7 +1331,9 @@ impl SourceMap {
|
||||
}
|
||||
}
|
||||
|
||||
let name_idx = name.map(|name| builder.add_name(name));
|
||||
let name_idx = name
|
||||
.or_else(|| config.name_for_bytepos(pos))
|
||||
.map(|name| builder.add_name(name));
|
||||
|
||||
builder.add_raw(lc.line, lc.col, line, col, Some(src_id), name_idx);
|
||||
prev_dst_line = lc.line;
|
||||
@ -1434,6 +1469,20 @@ impl SourceMapGenConfig for DefaultSourceMapGenConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/// Stores the state of the last conversion between BytePos and CharPos.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct ByteToCharPosState {
|
||||
/// The last BytePos to convert.
|
||||
pos: BytePos,
|
||||
|
||||
/// The total number of extra chars in the UTF-8 encoding.
|
||||
total_extra_bytes: u32,
|
||||
|
||||
/// The index of the last MultiByteChar read to compute the extra bytes of
|
||||
/// the last conversion.
|
||||
mbc_index: usize,
|
||||
}
|
||||
|
||||
// _____________________________________________________________________________
|
||||
// Tests
|
||||
//
|
||||
@ -1653,6 +1702,52 @@ mod tests {
|
||||
assert!(sm.merge_spans(span1, span2).is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn calc_utf16_offset() {
|
||||
let input = "t¢e∆s💩t";
|
||||
let sm = SourceMap::new(FilePathMapping::empty());
|
||||
let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string());
|
||||
|
||||
let mut state = ByteToCharPosState::default();
|
||||
let mut bpos = file.start_pos;
|
||||
let mut cpos = CharPos(bpos.to_usize());
|
||||
for c in input.chars() {
|
||||
let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state);
|
||||
|
||||
assert_eq!(actual, cpos.to_u32());
|
||||
|
||||
bpos = bpos + BytePos(c.len_utf8() as u32);
|
||||
cpos = cpos + CharPos(c.len_utf16());
|
||||
}
|
||||
|
||||
for c in input.chars().rev() {
|
||||
bpos = bpos - BytePos(c.len_utf8() as u32);
|
||||
cpos = cpos - CharPos(c.len_utf16());
|
||||
|
||||
let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state);
|
||||
|
||||
assert_eq!(actual, cpos.to_u32());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bytepos_to_charpos() {
|
||||
let input = "t¢e∆s💩t";
|
||||
let sm = SourceMap::new(FilePathMapping::empty());
|
||||
let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string());
|
||||
|
||||
let mut bpos = file.start_pos;
|
||||
let mut cpos = CharPos(0);
|
||||
for c in input.chars() {
|
||||
let actual = sm.bytepos_to_file_charpos_with(&file, bpos);
|
||||
|
||||
assert_eq!(actual, cpos);
|
||||
|
||||
bpos = bpos + BytePos(c.len_utf8() as u32);
|
||||
cpos = cpos + CharPos(c.len_utf16());
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the span corresponding to the `n`th occurrence of
|
||||
/// `substring` in `source_text`.
|
||||
trait SourceMapExtension {
|
||||
|
@ -737,6 +737,21 @@ pub struct MultiByteChar {
|
||||
pub bytes: u8,
|
||||
}
|
||||
|
||||
impl MultiByteChar {
|
||||
/// Computes the extra number of UTF-8 bytes necessary to encode a code
|
||||
/// point, compared to UTF-16 encoding.
|
||||
///
|
||||
/// 1, 2, and 3 UTF-8 bytes encode into 1 UTF-16 char, but 4 UTF-8 bytes
|
||||
/// encode into 2.
|
||||
pub fn byte_to_char_diff(&self) -> u8 {
|
||||
if self.bytes == 4 {
|
||||
2
|
||||
} else {
|
||||
self.bytes - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Identifies an offset of a non-narrow character in a SourceFile
|
||||
#[cfg_attr(
|
||||
any(feature = "rkyv-impl", feature = "rkyv-bytecheck-impl"),
|
||||
@ -1002,7 +1017,9 @@ pub trait Pos {
|
||||
/// - Values larger than `u32::MAX - 2^16` are reserved for the comments.
|
||||
///
|
||||
/// `u32::MAX` is special value used to generate source map entries.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize)]
|
||||
#[derive(
|
||||
Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize, Default,
|
||||
)]
|
||||
#[serde(transparent)]
|
||||
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
|
||||
#[cfg_attr(
|
||||
|
@ -40,21 +40,7 @@ impl Context {
|
||||
return (None, None);
|
||||
}
|
||||
|
||||
// We rename this to feel more comfortable while doing math.
|
||||
let start_offset = self.fm.start_pos;
|
||||
|
||||
let mut start = span.lo.0 - start_offset.0;
|
||||
let mut end = span.hi.0 - start_offset.0;
|
||||
|
||||
for mb in self.fm.multibyte_chars.iter() {
|
||||
if mb.pos < span.lo {
|
||||
start -= (mb.bytes - 1) as u32;
|
||||
}
|
||||
|
||||
if mb.pos < span.hi {
|
||||
end -= (mb.bytes - 1) as u32;
|
||||
}
|
||||
}
|
||||
let (start, end) = self.cm.span_to_char_offset(&self.fm, span);
|
||||
|
||||
(Some(start), Some(end))
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user