From 541590f9d969135a6b66a9e2ab732265c2c38939 Mon Sep 17 00:00:00 2001 From: Folkert Date: Wed, 18 May 2022 15:39:58 +0200 Subject: [PATCH 1/3] use simd in actual comment parsing --- compiler/parse/src/blankspace.rs | 93 +++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/compiler/parse/src/blankspace.rs b/compiler/parse/src/blankspace.rs index 5b623fcc47..a4e3406dbf 100644 --- a/compiler/parse/src/blankspace.rs +++ b/compiler/parse/src/blankspace.rs @@ -359,8 +359,8 @@ fn eat_line_comment<'a>( mut multiline: bool, mut comments_and_newlines: Vec<'a, CommentOrNewline<'a>>, ) -> SpaceState<'a> { - let mut index = 0; - let bytes = state.bytes(); + let mut index = state.pos().offset as usize; + let bytes = state.original_bytes(); let length = bytes.len(); 'outer: loop { @@ -432,6 +432,95 @@ fn eat_line_comment<'a>( let loop_start = index; + #[cfg(target_arch = "x86_64")] + { + use std::arch::x86_64::*; + + // a bytestring with the three characters we're looking for (the rest is ignored) + let needle = b"\r\n\t============="; + let needle = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) }; + + while index < length { + let remaining = length - index; + let chunk = if remaining < 16 { remaining as i32 } else { 16 }; + + // the source bytes we'll be looking at + let haystack = unsafe { _mm_loadu_si128(bytes.as_ptr().add(index) as *const _) }; + + // use first 3 characters of needle, first chunk` characters of haystack + // finds the first index where one of the `needle` characters occurs + // or 16 when none of the needle characters occur + let first_special_char = + unsafe { _mm_cmpestri(needle, 3, haystack, chunk, _SIDD_CMP_EQUAL_ANY) }; + + // we've made `first_special_char` characters of progress + index += first_special_char as usize; + state = state.advance(first_special_char as usize); + + // if we found a special char, let the outer loop handle it + if first_special_char != 16 { + match bytes[index] { + b'\t' => unreachable!(), + b'\n' => { + let comment = + unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) }; + + if is_doc_comment { + comments_and_newlines.push(CommentOrNewline::DocComment(comment)); + } else { + comments_and_newlines.push(CommentOrNewline::LineComment(comment)); + } + state = state.advance_newline(); + multiline = true; + + index += 1; + while index < length { + match bytes[index] { + b' ' => { + state = state.advance(1); + } + b'\n' => { + state = state.advance_newline(); + multiline = true; + comments_and_newlines.push(CommentOrNewline::Newline); + } + b'\r' => { + state = state.advance_newline(); + } + b'\t' => unreachable!(), + b'#' => { + state = state.advance(1); + index += 1; + continue 'outer; + } + _ => break, + } + + index += 1; + } + + return SpaceState { + state, + multiline, + comments_and_newlines, + }; + } + b'\r' => { + state = state.advance_newline(); + index += 1; + } + odd_character => { + unreachable!( + "unexpected_character {} {}", + odd_character, odd_character as char + ) + } + } + } + } + } + + #[cfg(not(target_arch = "x86_64"))] while index < length { match bytes[index] { b'\t' => unreachable!(), From b390907610e35877c2e76492004b2449e7f63f0f Mon Sep 17 00:00:00 2001 From: Folkert Date: Wed, 18 May 2022 18:06:57 +0200 Subject: [PATCH 2/3] stay within bounds --- compiler/parse/src/blankspace.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compiler/parse/src/blankspace.rs b/compiler/parse/src/blankspace.rs index a4e3406dbf..b25aa9aef1 100644 --- a/compiler/parse/src/blankspace.rs +++ b/compiler/parse/src/blankspace.rs @@ -278,7 +278,7 @@ fn fast_eat_spaces(state: &State) -> FastSpaceState { }; // we've made `first_special_char` characters of progress - index += first_special_char as usize; + index += usize::min(first_special_char as usize, remaining); // if we found a special char, let the outer loop handle it if first_special_char != 16 { @@ -454,8 +454,9 @@ fn eat_line_comment<'a>( unsafe { _mm_cmpestri(needle, 3, haystack, chunk, _SIDD_CMP_EQUAL_ANY) }; // we've made `first_special_char` characters of progress - index += first_special_char as usize; - state = state.advance(first_special_char as usize); + let progress = usize::min(first_special_char as usize, remaining); + index += progress; + state = state.advance(progress); // if we found a special char, let the outer loop handle it if first_special_char != 16 { From c785577636f116df3557f6094b343dea7d9dabfa Mon Sep 17 00:00:00 2001 From: Folkert Date: Fri, 20 May 2022 13:14:42 +0200 Subject: [PATCH 3/3] fix comment --- compiler/parse/src/blankspace.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/parse/src/blankspace.rs b/compiler/parse/src/blankspace.rs index b25aa9aef1..1804cdc284 100644 --- a/compiler/parse/src/blankspace.rs +++ b/compiler/parse/src/blankspace.rs @@ -458,7 +458,6 @@ fn eat_line_comment<'a>( index += progress; state = state.advance(progress); - // if we found a special char, let the outer loop handle it if first_special_char != 16 { match bytes[index] { b'\t' => unreachable!(),