-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(common): Mark `\r` as a line ending
#6752
Merged
Merged
Changes from 5 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
5c11c75
Remove sse2 version
kdy1 21015c9
Fix
kdy1 78d8c39
feedback
kdy1 2c6d0d4
Add a test
kdy1 992b8d8
fixup
kdy1 ecceb88
feedback
kdy1 a61a26c
Remove dbg!
kdy1 e5f3631
Add a test
kdy1 dd69bb1
fix
kdy1 5bb8255
Remove wrong
kdy1 ed45a78
Merge branch 'main' into issue-6694
swc-bot File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,8 +25,9 @@ pub fn analyze_source_file( | |
let mut non_narrow_chars = vec![]; | ||
|
||
// Calls the right implementation, depending on hardware support available. | ||
analyze_source_file_dispatch( | ||
analyze_source_file_generic( | ||
src, | ||
src.len(), | ||
source_file_start_pos, | ||
&mut lines, | ||
&mut multi_byte_chars, | ||
|
@@ -47,167 +48,6 @@ pub fn analyze_source_file( | |
(lines, multi_byte_chars, non_narrow_chars) | ||
} | ||
|
||
cfg_if::cfg_if! { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not ideal, but I'm not sure if this optimization is really required. |
||
if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] { | ||
fn analyze_source_file_dispatch(src: &str, | ||
source_file_start_pos: BytePos, | ||
lines: &mut Vec<BytePos>, | ||
multi_byte_chars: &mut Vec<MultiByteChar>, | ||
non_narrow_chars: &mut Vec<NonNarrowChar>) { | ||
if is_x86_feature_detected!("sse2") && cfg!(not(miri)) { | ||
unsafe { | ||
analyze_source_file_sse2(src, | ||
source_file_start_pos, | ||
lines, | ||
multi_byte_chars, | ||
non_narrow_chars); | ||
} | ||
} else { | ||
analyze_source_file_generic(src, | ||
src.len(), | ||
source_file_start_pos, | ||
lines, | ||
multi_byte_chars, | ||
non_narrow_chars); | ||
|
||
} | ||
} | ||
|
||
/// Checks 16 byte chunks of text at a time. If the chunk contains | ||
/// something other than printable ASCII characters and newlines, the | ||
/// function falls back to the generic implementation. Otherwise it uses | ||
/// SSE2 intrinsics to quickly find all newlines. | ||
#[target_feature(enable = "sse2")] | ||
unsafe fn analyze_source_file_sse2(src: &str, | ||
output_offset: BytePos, | ||
lines: &mut Vec<BytePos>, | ||
multi_byte_chars: &mut Vec<MultiByteChar>, | ||
non_narrow_chars: &mut Vec<NonNarrowChar>) { | ||
#[cfg(target_arch = "x86")] | ||
use std::arch::x86::*; | ||
#[cfg(target_arch = "x86_64")] | ||
use std::arch::x86_64::*; | ||
|
||
const CHUNK_SIZE: usize = 16; | ||
|
||
let src_bytes = src.as_bytes(); | ||
|
||
let chunk_count = src.len() / CHUNK_SIZE; | ||
|
||
// This variable keeps track of where we should start decoding a | ||
// chunk. If a multi-byte character spans across chunk boundaries, | ||
// we need to skip that part in the next chunk because we already | ||
// handled it. | ||
let mut intra_chunk_offset = 0; | ||
|
||
for chunk_index in 0 .. chunk_count { | ||
let ptr = src_bytes.as_ptr() as *const __m128i; | ||
// We don't know if the pointer is aligned to 16 bytes, so we | ||
// use `loadu`, which supports unaligned loading. | ||
let chunk = _mm_loadu_si128(ptr.add(chunk_index)); | ||
|
||
// For character in the chunk, see if its byte value is < 0, which | ||
// indicates that it's part of a UTF-8 char. | ||
let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); | ||
// Create a bit mask from the comparison results. | ||
let multibyte_mask = _mm_movemask_epi8(multibyte_test); | ||
|
||
// If the bit mask is all zero, we only have ASCII chars here: | ||
if multibyte_mask == 0 { | ||
assert!(intra_chunk_offset == 0); | ||
|
||
// Check if there are any control characters in the chunk. All | ||
// control characters that we can encounter at this point have a | ||
// byte value less than 32 or ... | ||
let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)); | ||
let control_char_mask0 = _mm_movemask_epi8(control_char_test0); | ||
|
||
// ... it's the ASCII 'DEL' character with a value of 127. | ||
let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)); | ||
let control_char_mask1 = _mm_movemask_epi8(control_char_test1); | ||
|
||
let control_char_mask = control_char_mask0 | control_char_mask1; | ||
|
||
if control_char_mask != 0 { | ||
// Check for newlines in the chunk | ||
let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); | ||
let newlines_mask = _mm_movemask_epi8(newlines_test); | ||
|
||
if control_char_mask == newlines_mask { | ||
// All control characters are newlines, record them | ||
let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; | ||
let output_offset = output_offset + | ||
BytePos::from_usize(chunk_index * CHUNK_SIZE + 1); | ||
|
||
loop { | ||
let index = newlines_mask.trailing_zeros(); | ||
|
||
if index >= CHUNK_SIZE as u32 { | ||
// We have arrived at the end of the chunk. | ||
break | ||
} | ||
|
||
lines.push(BytePos(index) + output_offset); | ||
|
||
// Clear the bit, so we can find the next one. | ||
newlines_mask &= (!1) << index; | ||
} | ||
|
||
// We are done for this chunk. All control characters were | ||
// newlines and we took care of those. | ||
continue | ||
} else { | ||
// Some of the control characters are not newlines, | ||
// fall through to the slow path below. | ||
} | ||
} else { | ||
// No control characters, nothing to record for this chunk | ||
continue | ||
} | ||
} | ||
|
||
// The slow path. | ||
// There are control chars in here, fallback to generic decoding. | ||
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; | ||
intra_chunk_offset = analyze_source_file_generic( | ||
&src[scan_start .. ], | ||
CHUNK_SIZE - intra_chunk_offset, | ||
BytePos::from_usize(scan_start) + output_offset, | ||
lines, | ||
multi_byte_chars, | ||
non_narrow_chars | ||
); | ||
} | ||
|
||
// There might still be a tail left to analyze | ||
let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; | ||
if tail_start < src.len() { | ||
analyze_source_file_generic(&src[tail_start as usize ..], | ||
src.len() - tail_start, | ||
output_offset + BytePos::from_usize(tail_start), | ||
lines, | ||
multi_byte_chars, | ||
non_narrow_chars); | ||
} | ||
} | ||
} else { | ||
|
||
// The target (or compiler version) does not support SSE2 ... | ||
fn analyze_source_file_dispatch(src: &str, | ||
source_file_start_pos: BytePos, | ||
lines: &mut Vec<BytePos>, | ||
multi_byte_chars: &mut Vec<MultiByteChar>, | ||
non_narrow_chars: &mut Vec<NonNarrowChar>) { | ||
analyze_source_file_generic(src, | ||
src.len(), | ||
source_file_start_pos, | ||
lines, | ||
multi_byte_chars, | ||
non_narrow_chars); | ||
} | ||
} | ||
} | ||
|
||
// `scan_len` determines the number of bytes in `src` to scan. Note that the | ||
// function can read past `scan_len` if a multi-byte character starts within the | ||
// range but extends past it. The overflow is returned by the function. | ||
|
@@ -240,6 +80,14 @@ fn analyze_source_file_generic( | |
let pos = BytePos::from_usize(i) + output_offset; | ||
|
||
match byte { | ||
b'\r' => { | ||
lines.push(pos + BytePos(1)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Won't this need to be |
||
if let Some(b'\n') = src_bytes.get(i as usize + 1) { | ||
i += 2; | ||
continue; | ||
} | ||
} | ||
|
||
b'\n' => { | ||
lines.push(pos + BytePos(1)); | ||
} | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are the debug and prints supposed to still be here?