Skip to content

Commit a203fdb

Browse files
authored Dec 4, 2022
fix(common): Fix BytePos -> CharPos calculations (#6574)
**Description:**

This fixes the BytePos -> CharPos calculation necessary for source maps. There were a few issues in the old code:

1. UTF-8 maps 1-3 bytes into 1 UTF-16 char, but 4 bytes into 2 UTF-16 chars
2. The starting offset was not recorded when we reached the end of the `multibyte_chars` iteration
3. The `mappings` can be unordered, meaning we need to restart the UTF-16 offset calculation

**Related issue:**

- Closes #6552.
1 parent 8bee06f commit a203fdb

File tree

11 files changed

+243
-61
lines changed

11 files changed

+243
-61
lines changed
 
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"sourceMaps": true,
3+
"jsc": {
4+
"parser": {
5+
"syntax": "ecmascript",
6+
"jsx": false
7+
},
8+
"target": "es5",
9+
"loose": false,
10+
"minify": {
11+
"compress": false,
12+
"mangle": false
13+
}
14+
},
15+
"module": {
16+
"type": "commonjs"
17+
},
18+
"minify": true,
19+
"isModule": true
20+
}

‎crates/swc/tests/fixture/sourcemap/issue-6552/input-map/input/index.js

+4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb="";
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UACnB,IAAMC,IAAM",
3+
"names": [
4+
"xxx",
5+
"console",
6+
"error",
7+
"message",
8+
"bbb"
9+
],
10+
"sources": [
11+
"../../input/index.js"
12+
],
13+
"sourcesContent": [
14+
"const xxx = ', something';\nconsole.error(`❌ ${message}`);\nconst bbb = '';\n//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJuYW1lcyI6WyJ4eHgiLCJjb25zb2xlIiwiZXJyb3IiLCJtZXNzYWdlIiwiYmJiIl0sInNvdXJjZXMiOlsidW5rbm93biJdLCJzb3VyY2VzQ29udGVudCI6WyJjb25zdCB4eHggPSAnLCBzb21ldGhpbmcnXG5jb25zb2xlLmVycm9yKGDinYwgJHttZXNzYWdlfWApO1xuXG5jb25zdCBiYmIgPSAnJ1xuIl0sIm1hcHBpbmdzIjoiQUFBQSxNQUFNQSxHQUFHLEdBQUcsYUFBWjtBQUNBQyxPQUFPLENBQUNDLEtBQVIsQ0FBZSxLQUFJQyxPQUFRLEVBQTNCO0FBRUEsTUFBTUMsR0FBRyxHQUFHLEVBQVoifQ==\n"
15+
],
16+
"version": 3
17+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"sourceMaps": true,
3+
"jsc": {
4+
"parser": {
5+
"syntax": "ecmascript",
6+
"jsx": false
7+
},
8+
"target": "es5",
9+
"loose": false,
10+
"minify": {
11+
"compress": false,
12+
"mangle": false
13+
}
14+
},
15+
"module": {
16+
"type": "commonjs"
17+
},
18+
"minify": true,
19+
"isModule": true
20+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
const xxx = ', something'
2+
console.error(`❌ ${message}`);
3+
4+
const bbb = ''
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"use strict";var xxx=", something";console.error("❌ ".concat(message));var bbb="";
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"mappings": "AAAA,aAAA,IAAMA,IAAM,cACZC,QAAQC,KAAK,CAAC,AAAC,KAAY,OAARC,UAEnB,IAAMC,IAAM",
3+
"names": [
4+
"xxx",
5+
"console",
6+
"error",
7+
"message",
8+
"bbb"
9+
],
10+
"sources": [
11+
"../../input/index.js"
12+
],
13+
"sourcesContent": [
14+
"const xxx = ', something'\nconsole.error(`❌ ${message}`);\n\nconst bbb = ''\n"
15+
],
16+
"version": 3
17+
}

‎crates/swc_common/src/source_map.rs

+140 -45
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@
1717
//! within the SourceMap, which upon request can be converted to line and column
1818
//! information, source code snippets, etc.
1919
use std::{
20-
cmp,
21-
cmp::{max, min},
22-
env, fs,
20+
cmp, env, fs,
2321
hash::Hash,
2422
io,
2523
path::{Path, PathBuf},
@@ -295,8 +293,7 @@ impl SourceMap {
295293
);
296294

297295
let linechpos = self.bytepos_to_file_charpos_with(&f, linebpos);
298-
299-
let col = max(chpos, linechpos) - min(chpos, linechpos);
296+
let col = chpos - linechpos;
300297

301298
let col_display = {
302299
let start_width_idx = f
@@ -954,7 +951,7 @@ impl SourceMap {
954951
}
955952

956953
fn bytepos_to_file_charpos_with(&self, map: &SourceFile, bpos: BytePos) -> CharPos {
957-
let total_extra_bytes = self.calc_extra_bytes(map, &mut 0, &mut 0, bpos);
954+
let total_extra_bytes = self.calc_utf16_offset(map, bpos, &mut Default::default());
958955
assert!(
959956
map.start_pos.to_u32() + total_extra_bytes <= bpos.to_u32(),
960957
"map.start_pos = {:?}; total_extra_bytes = {}; bpos = {:?}",
@@ -965,23 +962,43 @@ impl SourceMap {
965962
CharPos(bpos.to_usize() - map.start_pos.to_usize() - total_extra_bytes as usize)
966963
}
967964

968-
/// Converts an absolute BytePos to a CharPos relative to the source_file.
969-
fn calc_extra_bytes(
965+
/// Converts a span of absolute BytePos to a CharPos relative to the
966+
/// source_file.
967+
pub fn span_to_char_offset(&self, file: &SourceFile, span: Span) -> (u32, u32) {
968+
// We rename this to feel more comfortable while doing math.
969+
let start_offset = file.start_pos;
970+
971+
let mut state = ByteToCharPosState::default();
972+
let start = span.lo.to_u32()
973+
- start_offset.to_u32()
974+
- self.calc_utf16_offset(file, span.lo, &mut state);
975+
let end = span.hi.to_u32()
976+
- start_offset.to_u32()
977+
- self.calc_utf16_offset(file, span.hi, &mut state);
978+
979+
(start, end)
980+
}
981+
982+
/// Calculates the number of excess chars seen in the UTF-8 encoding of a
983+
/// file compared with the UTF-16 encoding.
984+
fn calc_utf16_offset(
970985
&self,
971-
map: &SourceFile,
972-
prev_total_extra_bytes: &mut u32,
973-
start: &mut usize,
986+
file: &SourceFile,
974987
bpos: BytePos,
988+
state: &mut ByteToCharPosState,
975989
) -> u32 {
976-
// The number of extra bytes due to multibyte chars in the SourceFile
977-
let mut total_extra_bytes = *prev_total_extra_bytes;
978-
979-
for (i, &mbc) in map.multibyte_chars[*start..].iter().enumerate() {
980-
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
981-
if mbc.pos < bpos {
982-
// every character is at least one byte, so we only
983-
// count the actual extra bytes.
984-
total_extra_bytes += mbc.bytes as u32 - 1;
990+
let mut total_extra_bytes = state.total_extra_bytes;
991+
let mut index = state.mbc_index;
992+
993+
if bpos >= state.pos {
994+
let range = index..file.multibyte_chars.len();
995+
for i in range {
996+
let mbc = &file.multibyte_chars[i];
997+
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
998+
if mbc.pos >= bpos {
999+
break;
1000+
}
1001+
total_extra_bytes += mbc.byte_to_char_diff() as u32;
9851002
// We should never see a byte position in the middle of a
9861003
// character
9871004
debug_assert!(
@@ -991,13 +1008,32 @@ impl SourceMap {
9911008
mbc.pos,
9921009
mbc.bytes
9931010
);
994-
} else {
995-
*start += i;
996-
break;
1011+
index += 1;
1012+
}
1013+
} else {
1014+
let range = 0..index;
1015+
for i in range.rev() {
1016+
let mbc = &file.multibyte_chars[i];
1017+
debug!("{}-byte char at {:?}", mbc.bytes, mbc.pos);
1018+
if mbc.pos < bpos {
1019+
break;
1020+
}
1021+
total_extra_bytes -= mbc.byte_to_char_diff() as u32;
1022+
// We should never see a byte position in the middle of a
1023+
// character
1024+
debug_assert!(
1025+
bpos.to_u32() <= mbc.pos.to_u32(),
1026+
"bpos = {:?}, mbc.pos = {:?}",
1027+
bpos,
1028+
mbc.pos,
1029+
);
1030+
index -= 1;
9971031
}
9981032
}
9991033

1000-
*prev_total_extra_bytes = total_extra_bytes;
1034+
state.pos = bpos;
1035+
state.total_extra_bytes = total_extra_bytes;
1036+
state.mbc_index = index;
10011037

10021038
total_extra_bytes
10031039
}
@@ -1191,11 +1227,9 @@ impl SourceMap {
11911227

11921228
let mut prev_dst_line = u32::MAX;
11931229

1194-
let mut prev_extra_bytes = 0;
1195-
let mut ch_start = 0;
1196-
let mut line_prev_extra_bytes = 0;
1197-
let mut line_ch_start = 0;
11981230
let mut inline_sources_content = false;
1231+
let mut ch_state = ByteToCharPosState::default();
1232+
let mut line_state = ByteToCharPosState::default();
11991233

12001234
for (pos, lc) in mappings.iter() {
12011235
let pos = *pos;
@@ -1229,11 +1263,8 @@ impl SourceMap {
12291263
builder.set_source_contents(src_id, Some(&f.src));
12301264
}
12311265

1232-
prev_extra_bytes = 0;
1233-
ch_start = 0;
1234-
1235-
line_prev_extra_bytes = 0;
1236-
line_ch_start = 0;
1266+
ch_state = ByteToCharPosState::default();
1267+
line_state = ByteToCharPosState::default();
12371268

12381269
cur_file = Some(f.clone());
12391270
&f
@@ -1253,7 +1284,6 @@ impl SourceMap {
12531284
Some(line) => line as u32,
12541285
None => continue,
12551286
};
1256-
let mut name = config.name_for_bytepos(pos);
12571287

12581288
let linebpos = f.lines[line as usize];
12591289
debug_assert!(
@@ -1263,18 +1293,21 @@ impl SourceMap {
12631293
pos,
12641294
linebpos,
12651295
);
1266-
let chpos =
1267-
pos.to_u32() - self.calc_extra_bytes(f, &mut prev_extra_bytes, &mut ch_start, pos);
1268-
let linechpos = linebpos.to_u32()
1269-
- self.calc_extra_bytes(
1270-
f,
1271-
&mut line_prev_extra_bytes,
1272-
&mut line_ch_start,
1273-
linebpos,
1274-
);
12751296

1276-
let mut col = max(chpos, linechpos) - min(chpos, linechpos);
1297+
let linechpos =
1298+
linebpos.to_u32() - self.calc_utf16_offset(f, linebpos, &mut line_state);
1299+
let chpos = pos.to_u32() - self.calc_utf16_offset(f, pos, &mut ch_state);
1300+
1301+
debug_assert!(
1302+
chpos >= linechpos,
1303+
"{}: chpos = {:?}; linechpos = {:?};",
1304+
f.name,
1305+
chpos,
1306+
linechpos,
1307+
);
12771308

1309+
let mut col = chpos - linechpos;
1310+
let mut name = None;
12781311
if let Some(orig) = &orig {
12791312
if let Some(token) = orig
12801313
.lookup_token(line, col)
@@ -1298,7 +1331,9 @@ impl SourceMap {
12981331
}
12991332
}
13001333

1301-
let name_idx = name.map(|name| builder.add_name(name));
1334+
let name_idx = name
1335+
.or_else(|| config.name_for_bytepos(pos))
1336+
.map(|name| builder.add_name(name));
13021337

13031338
builder.add_raw(lc.line, lc.col, line, col, Some(src_id), name_idx);
13041339
prev_dst_line = lc.line;
@@ -1434,6 +1469,20 @@ impl SourceMapGenConfig for DefaultSourceMapGenConfig {
14341469
}
14351470
}
14361471

1472+
/// Stores the state of the last conversion between BytePos and CharPos.
1473+
#[derive(Debug, Clone, Default)]
1474+
pub struct ByteToCharPosState {
1475+
/// The last BytePos to convert.
1476+
pos: BytePos,
1477+
1478+
/// The total number of extra chars in the UTF-8 encoding.
1479+
total_extra_bytes: u32,
1480+
1481+
/// The index of the last MultiByteChar read to compute the extra bytes of
1482+
/// the last conversion.
1483+
mbc_index: usize,
1484+
}
1485+
14371486
// _____________________________________________________________________________
14381487
// Tests
14391488
//
@@ -1653,6 +1702,52 @@ mod tests {
16531702
assert!(sm.merge_spans(span1, span2).is_none());
16541703
}
16551704

1705+
#[test]
1706+
fn calc_utf16_offset() {
1707+
let input = "t¢e∆s💩t";
1708+
let sm = SourceMap::new(FilePathMapping::empty());
1709+
let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string());
1710+
1711+
let mut state = ByteToCharPosState::default();
1712+
let mut bpos = file.start_pos;
1713+
let mut cpos = CharPos(bpos.to_usize());
1714+
for c in input.chars() {
1715+
let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state);
1716+
1717+
assert_eq!(actual, cpos.to_u32());
1718+
1719+
bpos = bpos + BytePos(c.len_utf8() as u32);
1720+
cpos = cpos + CharPos(c.len_utf16());
1721+
}
1722+
1723+
for c in input.chars().rev() {
1724+
bpos = bpos - BytePos(c.len_utf8() as u32);
1725+
cpos = cpos - CharPos(c.len_utf16());
1726+
1727+
let actual = bpos.to_u32() - sm.calc_utf16_offset(&file, bpos, &mut state);
1728+
1729+
assert_eq!(actual, cpos.to_u32());
1730+
}
1731+
}
1732+
1733+
#[test]
1734+
fn bytepos_to_charpos() {
1735+
let input = "t¢e∆s💩t";
1736+
let sm = SourceMap::new(FilePathMapping::empty());
1737+
let file = sm.new_source_file(PathBuf::from("blork.rs").into(), input.to_string());
1738+
1739+
let mut bpos = file.start_pos;
1740+
let mut cpos = CharPos(0);
1741+
for c in input.chars() {
1742+
let actual = sm.bytepos_to_file_charpos_with(&file, bpos);
1743+
1744+
assert_eq!(actual, cpos);
1745+
1746+
bpos = bpos + BytePos(c.len_utf8() as u32);
1747+
cpos = cpos + CharPos(c.len_utf16());
1748+
}
1749+
}
1750+
16561751
/// Returns the span corresponding to the `n`th occurrence of
16571752
/// `substring` in `source_text`.
16581753
trait SourceMapExtension {

‎crates/swc_common/src/syntax_pos.rs

+18 -1
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,21 @@ pub struct MultiByteChar {
737737
pub bytes: u8,
738738
}
739739

740+
impl MultiByteChar {
741+
/// Computes the extra number of UTF-8 bytes necessary to encode a code
742+
/// point, compared to UTF-16 encoding.
743+
///
744+
/// 1, 2, and 3 UTF-8 bytes encode into 1 UTF-16 char, but 4 UTF-8 bytes
745+
/// encode into 2.
746+
pub fn byte_to_char_diff(&self) -> u8 {
747+
if self.bytes == 4 {
748+
2
749+
} else {
750+
self.bytes - 1
751+
}
752+
}
753+
}
754+
740755
/// Identifies an offset of a non-narrow character in a SourceFile
741756
#[cfg_attr(
742757
any(feature = "rkyv-impl", feature = "rkyv-bytecheck-impl"),
@@ -1002,7 +1017,9 @@ pub trait Pos {
10021017
/// - Values larger than `u32::MAX - 2^16` are reserved for the comments.
10031018
///
10041019
/// `u32::MAX` is special value used to generate source map entries.
1005-
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize)]
1020+
#[derive(
1021+
Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Debug, Serialize, Deserialize, Default,
1022+
)]
10061023
#[serde(transparent)]
10071024
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
10081025
#[cfg_attr(

‎crates/swc_estree_compat/src/babelify/mod.rs

+1 -15
Original file line numberDiff line numberDiff line change
@@ -40,21 +40,7 @@ impl Context {
4040
return (None, None);
4141
}
4242

43-
// We rename this to feel more comfortable while doing math.
44-
let start_offset = self.fm.start_pos;
45-
46-
let mut start = span.lo.0 - start_offset.0;
47-
let mut end = span.hi.0 - start_offset.0;
48-
49-
for mb in self.fm.multibyte_chars.iter() {
50-
if mb.pos < span.lo {
51-
start -= (mb.bytes - 1) as u32;
52-
}
53-
54-
if mb.pos < span.hi {
55-
end -= (mb.bytes - 1) as u32;
56-
}
57-
}
43+
let (start, end) = self.cm.span_to_char_offset(&self.fm, span);
5844

5945
(Some(start), Some(end))
6046
}

1 commit comment

Comments
 (1)

github-actions[bot] commented on Dec 4, 2022

@github-actions[bot]

Benchmark

Benchmark suite Current: a203fdb Previous: 01232f0 Ratio
es/full/bugs-1 433858 ns/iter (± 60049) 360532 ns/iter (± 22667) 1.20
es/full/minify/libraries/antd 2421609627 ns/iter (± 69302733) 2059896790 ns/iter (± 21826036) 1.18
es/full/minify/libraries/d3 471887492 ns/iter (± 17014566) 472027800 ns/iter (± 23829735) 1.00
es/full/minify/libraries/echarts 2117942431 ns/iter (± 71712587) 1741553955 ns/iter (± 49207818) 1.22
es/full/minify/libraries/jquery 132923529 ns/iter (± 6559938) 109253087 ns/iter (± 2209958) 1.22
es/full/minify/libraries/lodash 152517116 ns/iter (± 5581911) 125198057 ns/iter (± 3714508) 1.22
es/full/minify/libraries/moment 84859642 ns/iter (± 52635634) 64127061 ns/iter (± 7665962) 1.32
es/full/minify/libraries/react 28651895 ns/iter (± 15209523) 21947713 ns/iter (± 460113) 1.31
es/full/minify/libraries/terser 463019636 ns/iter (± 33428381) 331681260 ns/iter (± 15671794) 1.40
es/full/minify/libraries/three 736593634 ns/iter (± 66658898) 612215724 ns/iter (± 23303045) 1.20
es/full/minify/libraries/typescript 4838105034 ns/iter (± 148445746) 3793185807 ns/iter (± 58044728) 1.28
es/full/minify/libraries/victory 1112553848 ns/iter (± 83094318) 982142784 ns/iter (± 84583244) 1.13
es/full/minify/libraries/vue 215182264 ns/iter (± 24983236) 203116835 ns/iter (± 19896332) 1.06
es/full/codegen/es3 41999 ns/iter (± 886) 35936 ns/iter (± 5000) 1.17
es/full/codegen/es5 41891 ns/iter (± 2310) 34247 ns/iter (± 2453) 1.22
es/full/codegen/es2015 41479 ns/iter (± 2752) 34262 ns/iter (± 1185) 1.21
es/full/codegen/es2016 41953 ns/iter (± 5247) 34741 ns/iter (± 1521) 1.21
es/full/codegen/es2017 41691 ns/iter (± 3435) 34367 ns/iter (± 1609) 1.21
es/full/codegen/es2018 43614 ns/iter (± 25098) 34532 ns/iter (± 2530) 1.26
es/full/codegen/es2019 41370 ns/iter (± 4333) 34565 ns/iter (± 1558) 1.20
es/full/codegen/es2020 42106 ns/iter (± 6950) 34832 ns/iter (± 3878) 1.21
es/full/all/es3 237538178 ns/iter (± 28711261) 237315028 ns/iter (± 22871198) 1.00
es/full/all/es5 225625443 ns/iter (± 17751187) 225492813 ns/iter (± 19749924) 1.00
es/full/all/es2015 179765309 ns/iter (± 17404803) 181221843 ns/iter (± 15816575) 0.99
es/full/all/es2016 182401069 ns/iter (± 15335932) 178068147 ns/iter (± 19930294) 1.02
es/full/all/es2017 177936455 ns/iter (± 15811367) 174244091 ns/iter (± 17486793) 1.02
es/full/all/es2018 175847881 ns/iter (± 19239088) 172144942 ns/iter (± 14584821) 1.02
es/full/all/es2019 175094541 ns/iter (± 17610050) 169395980 ns/iter (± 21142491) 1.03
es/full/all/es2020 168017276 ns/iter (± 15066806) 167952429 ns/iter (± 16215571) 1.00
es/full/parser 873718 ns/iter (± 35587) 765715 ns/iter (± 78211) 1.14
es/full/base/fixer 32642 ns/iter (± 1935) 28320 ns/iter (± 5716) 1.15
es/full/base/resolver_and_hygiene 117221 ns/iter (± 3824) 98593 ns/iter (± 21215) 1.19
serialization of ast node 252 ns/iter (± 13) 208 ns/iter (± 7) 1.21
serialization of serde 266 ns/iter (± 20) 222 ns/iter (± 6) 1.20

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.