Merge branch 'johan/functional-ssfs'

This merge splits the styled string splitter into multiple functions for readability and maintainability. This makes BenchmarkHighlightedSearch about 10% slower, but I think that is worth the tradeoff.
2024-09-11 12:15:43 +03:00 · 2023-11-11 15:39:25 +01:00 · 2023-11-11 15:39:25 +01:00 · cfdf99d3e3
commit cfdf99d3e3
parent c4e94374ac 3676bbfd41
4 changed files with 343 additions and 258 deletions
--- a/m/ansiTokenizer.go
+++ b/m/ansiTokenizer.go
@ -424,194 +424,10 @@ type _StyledString struct {
 	Style  twin.Style
 }

-type parseState int
-
-const (
-	initial parseState = iota
-	justSawEsc
-	inStyle
-	gotOsc      // OSC = Operating System Command = ESC]
-	gotOsc8     // ESC]8
-	gotOsc8Semi // ESC]8;
-	inUrl       // After ESC]8;;
-	inUrlGotEsc // Expecting a \ now to terminate the URL
-)
-
-func styledStringsFromString(s string) styledStringsWithTrailer {
-	if !strings.ContainsAny(s, "\x1b") {
-		// This shortcut makes BenchmarkPlainTextSearch() perform a lot better
-		return styledStringsWithTrailer{
-			trailer: twin.StyleDefault,
-			styledStrings: []_StyledString{{
-				String: s,
-				Style:  twin.StyleDefault,
-			}},
-		}
-	}
-
-	trailer := twin.StyleDefault
-	parts := make([]_StyledString, 1)
-
-	state := initial
-	escIndex := -1 // Byte index into s
-	partStart := 0 // Byte index into s
-	urlStart := -1 // Byte index into s
-	style := twin.StyleDefault
-	for byteIndex, char := range s {
-		if state == initial {
-			if char == '\x1b' {
-				escIndex = byteIndex
-				state = justSawEsc
-			}
-			continue
-		} else if state == justSawEsc {
-			if char == '\x1b' {
-				escIndex = byteIndex
-				state = justSawEsc
-			} else if char == '[' {
-				state = inStyle
-			} else if char == ']' {
-				state = gotOsc
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == inStyle {
-			if char == '\x1b' {
-				escIndex = byteIndex
-				state = justSawEsc
-			} else if (char >= '0' && char <= '9') || char == ';' {
-				// Stay in style
-			} else if char == 'm' {
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-
-				style = updateStyle(style, s[escIndex:byteIndex+1])
-				partStart = byteIndex + 1 // Next part starts after this 'm'
-				state = initial
-			} else if char == 'K' {
-				ansiStyle := s[escIndex : byteIndex+1]
-				if ansiStyle != "\x1b[K" && ansiStyle != "\x1b[0K" {
-					// Not a supported clear operation, just treat the whole thing as plain text
-					state = initial
-					continue
-				}
-
-				// Handle clear-to-end-of-line
-
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-
-				trailer = style
-				partStart = byteIndex + 1 // Next part starts after this 'K'
-				state = initial
-			} else {
-				// Unsupported sequence, just treat the whole thing as plain text
-				state = initial
-			}
-			continue
-		} else if state == gotOsc {
-			if char == '8' {
-				state = gotOsc8
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == gotOsc8 {
-			if char == ';' {
-				state = gotOsc8Semi
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == gotOsc8Semi {
-			if char == ';' {
-				urlStart = byteIndex + 1
-				state = inUrl
-			} else {
-				state = initial
-			}
-			continue
-		} else if state == inUrl {
-			// Ref: https://stackoverflow.com/a/1547940/473672
-			const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="
-			if char == '\x1b' {
-				state = inUrlGotEsc
-			} else if char == '\x07' {
-				// End of URL
-
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-				partStart = byteIndex + 1
-
-				url := s[urlStart:byteIndex]
-				style = style.WithHyperlink(&url)
-				state = initial
-			} else if strings.ContainsRune(validChars, char) {
-				// Stay in URL
-			} else {
-				// Invalid URL character, just treat the whole thing as plain text
-				state = initial
-			}
-			continue
-		} else if state == inUrlGotEsc {
-			if char == '\\' {
-				// End of URL
-
-				if partStart < escIndex {
-					// Consume the most recent part
-					parts = append(parts, _StyledString{
-						String: s[partStart:escIndex],
-						Style:  style,
-					})
-				}
-				partStart = byteIndex + 1
-
-				url := s[urlStart : byteIndex-1]
-				style = style.WithHyperlink(&url)
-				state = initial
-			} else {
-				// Broken ending, just treat the whole thing as plain text
-				state = initial
-			}
-			continue
-		}
-
-		panic("We should never get here")
-	}
-
-	if partStart < len(s) {
-		// Consume the most recent part
-		parts = append(parts, _StyledString{
-			String: s[partStart:],
-			Style:  style,
-		})
-	}
-
-	return styledStringsWithTrailer{
-		styledStrings: parts,
-		trailer:       trailer,
-	}
-}
-
-// updateStyle parses a string of the form "ESC[33m" into changes to style
-func updateStyle(style twin.Style, escapeSequence string) twin.Style {
-	numbers := strings.Split(escapeSequence[2:len(escapeSequence)-1], ";")
+// rawUpdateStyle parses a string of the form "33m" into changes to style. This
+// is what comes after ESC[ in an ANSI SGR sequence.
+func rawUpdateStyle(style twin.Style, escapeSequenceWithoutHeader string) twin.Style {
+	numbers := strings.Split(escapeSequenceWithoutHeader[:len(escapeSequenceWithoutHeader)-1], ";")
 	index := 0
 	for index < len(numbers) {
 		number := numbers[index]
--- a/m/ansiTokenizer_test.go
+++ b/m/ansiTokenizer_test.go
@ -28,76 +28,78 @@ func cellsToPlainString(cells []twin.Cell) string {
 // without logging any errors
 func TestTokenize(t *testing.T) {
 	for _, fileName := range getTestFiles() {
-		file, err := os.Open(fileName)
-		if err != nil {
-			t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
-			continue
-		}
-		defer func() {
-			if err := file.Close(); err != nil {
-				panic(err)
+		t.Run(fileName, func(t *testing.T) {
+			file, err := os.Open(fileName)
+			if err != nil {
+				t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
+				return
 			}
-		}()
+			defer func() {
+				if err := file.Close(); err != nil {
+					panic(err)
+				}
+			}()

-		myReader := NewReaderFromStream(fileName, file)
-		for !myReader.done.Load() {
-		}
-
-		for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
-			line := myReader.GetLine(lineNumber)
-			lineNumber++
-
-			var loglines strings.Builder
-			log.SetOutput(&loglines)
-
-			tokens := cellsFromString(line.raw).Cells
-			plainString := withoutFormatting(line.raw)
-			if len(tokens) != utf8.RuneCountInString(plainString) {
-				t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
-					fileName, lineNumber,
-					len(tokens), utf8.RuneCountInString(plainString), line.raw)
-				continue
+			myReader := NewReaderFromStream(fileName, file)
+			for !myReader.done.Load() {
 			}

-			// Tokens and plain have the same lengths, compare contents
-			plainStringChars := []rune(plainString)
-			for index, plainChar := range plainStringChars {
-				cellChar := tokens[index]
-				if cellChar.Rune == plainChar {
+			for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
+				line := myReader.GetLine(lineNumber)
+				lineNumber++
+
+				var loglines strings.Builder
+				log.SetOutput(&loglines)
+
+				tokens := cellsFromString(line.raw).Cells
+				plainString := withoutFormatting(line.raw)
+				if len(tokens) != utf8.RuneCountInString(plainString) {
+					t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
+						fileName, lineNumber,
+						len(tokens), utf8.RuneCountInString(plainString), line.raw)
 					continue
 				}

-				if cellChar.Rune == '•' && plainChar == 'o' {
-					// Pretty bullets on man pages
+				// Tokens and plain have the same lengths, compare contents
+				plainStringChars := []rune(plainString)
+				for index, plainChar := range plainStringChars {
+					cellChar := tokens[index]
+					if cellChar.Rune == plainChar {
+						continue
+					}
+
+					if cellChar.Rune == '•' && plainChar == 'o' {
+						// Pretty bullets on man pages
+						continue
+					}
+
+					// Chars mismatch!
+					plainStringFromCells := cellsToPlainString(tokens)
+					positionMarker := strings.Repeat(" ", index) + "^"
+					cellCharString := string(cellChar.Rune)
+					if !twin.Printable(cellChar.Rune) {
+						cellCharString = fmt.Sprint(int(cellChar.Rune))
+					}
+					plainCharString := string(plainChar)
+					if !twin.Printable(plainChar) {
+						plainCharString = fmt.Sprint(int(plainChar))
+					}
+					t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n       %s",
+						fileName, lineNumber, index,
+						cellCharString, plainCharString,
+						plainString,
+						plainStringFromCells,
+						positionMarker,
+					)
+					break
+				}
+
+				if len(loglines.String()) != 0 {
+					t.Errorf("%s: %s", fileName, loglines.String())
 					continue
 				}
-
-				// Chars mismatch!
-				plainStringFromCells := cellsToPlainString(tokens)
-				positionMarker := strings.Repeat(" ", index) + "^"
-				cellCharString := string(cellChar.Rune)
-				if !twin.Printable(cellChar.Rune) {
-					cellCharString = fmt.Sprint(int(cellChar.Rune))
-				}
-				plainCharString := string(plainChar)
-				if !twin.Printable(plainChar) {
-					plainCharString = fmt.Sprint(int(plainChar))
-				}
-				t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n       %s",
-					fileName, lineNumber, index,
-					cellCharString, plainCharString,
-					plainString,
-					plainStringFromCells,
-					positionMarker,
-				)
-				break
 			}
-
-			if len(loglines.String()) != 0 {
-				t.Errorf("%s: %s", fileName, loglines.String())
-				continue
-			}
-		}
+		})
 	}
 }

@ -229,8 +231,8 @@ func TestConsumeCompositeColorIncomplete24Bit(t *testing.T) {
 	assert.Assert(t, color == nil)
 }

-func TestUpdateStyle(t *testing.T) {
-	numberColored := updateStyle(twin.StyleDefault, "\x1b[33m")
+func TestRawUpdateStyle(t *testing.T) {
+	numberColored := rawUpdateStyle(twin.StyleDefault, "33m")
 	assert.Equal(t, numberColored, twin.StyleDefault.Foreground(twin.NewColor16(3)))
 }

@ -287,15 +289,18 @@ func TestHyperlink_incomplete(t *testing.T) {
 	complete := "a\x1b]8;;X\x1b\\"

 	for l := len(complete) - 1; l >= 0; l-- {
-		tokens := cellsFromString(complete[:l]).Cells
+		incomplete := complete[:l]
+		t.Run(fmt.Sprintf("l=%d incomplete=<%s>", l, strings.ReplaceAll(incomplete, "\x1b", "ESC")), func(t *testing.T) {
+			tokens := cellsFromString(incomplete).Cells

-		for i := 0; i < l; i++ {
-			if complete[i] == '\x1b' {
-				// These get special rendering, if everything else matches
-				// that's good enough.
-				continue
+			for i := 0; i < l; i++ {
+				if complete[i] == '\x1b' {
+					// These get special rendering, if everything else matches
+					// that's good enough.
+					continue
+				}
+				assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
 			}
-			assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
-		}
+		})
 	}
 }
--- a/m/styledStringSplitter.go
+++ b/m/styledStringSplitter.go
@ -0,0 +1,237 @@
+package m
+
+import (
+	"strings"
+	"unicode/utf8"
+
+	"github.com/walles/moar/twin"
+)
+
+const esc = '\x1b' // ASCII ESC; run() hands off to handleEscape() when it sees this
+
+type styledStringSplitter struct {
+	input             string // The complete string being split
+	nextByteIndex     int    // Byte index into input where nextChar() will read next
+	previousByteIndex int    // Byte index into input where the latest nextChar() call started
+
+	inProgressString strings.Builder // Text collected so far for the part being built
+	inProgressStyle  twin.Style      // Style of the part being built
+
+	parts   []_StyledString // Finished parts, appended in input order
+	trailer twin.Style      // Set to the in-progress style when ESC[K or ESC[0K is seen
+}
+
+// styledStringsFromString splits s into runs of text that share one style,
+// and reports the trailer style picked up by the splitter along the way.
+func styledStringsFromString(s string) styledStringsWithTrailer {
+	if strings.ContainsRune(s, esc) {
+		// There is at least one ESC in there, do the real work
+		splitter := styledStringSplitter{input: s}
+		splitter.run()
+
+		return styledStringsWithTrailer{
+			styledStrings: splitter.parts,
+			trailer:       splitter.trailer,
+		}
+	}
+
+	// No ESC means no styling at all. This shortcut makes
+	// BenchmarkPlainTextSearch() perform a lot better.
+	plain := _StyledString{
+		String: s,
+		Style:  twin.StyleDefault,
+	}
+	return styledStringsWithTrailer{
+		styledStrings: []_StyledString{plain},
+		trailer:       twin.StyleDefault,
+	}
+}
+
+// nextChar decodes and returns the rune at the read position, then advances
+// past it. Returns -1, without advancing, once the input is exhausted.
+//
+// Either way, previousByteIndex is left at the position this call started
+// from.
+func (s *styledStringSplitter) nextChar() rune {
+	current := s.nextByteIndex
+	s.previousByteIndex = current
+	if current >= len(s.input) {
+		return -1
+	}
+
+	decoded, width := utf8.DecodeRuneInString(s.input[current:])
+	s.nextByteIndex = current + width
+	return decoded
+}
+
+// lastChar re-reads the rune at previousByteIndex, which is what the most
+// recent nextChar() call returned. Returns -1 if that position is at or past
+// the end of the input.
+func (s *styledStringSplitter) lastChar() rune {
+	if s.previousByteIndex < len(s.input) {
+		decoded, _ := utf8.DecodeRuneInString(s.input[s.previousByteIndex:])
+		return decoded
+	}
+
+	return -1
+}
+
+// run walks the whole input, collecting plain runes into the in-progress part
+// and handing escape sequences to handleEscape(). When the input ends, the
+// part being built is finalized.
+func (s *styledStringSplitter) run() {
+	for current := s.nextChar(); current != -1; {
+		if current != esc {
+			s.handleRune(current)
+			current = s.nextChar()
+			continue
+		}
+
+		sequenceStart := s.previousByteIndex
+		if s.handleEscape() {
+			// Sequence fully consumed, move on to whatever follows it
+			current = s.nextChar()
+			continue
+		}
+
+		// Somewhere in handleEscape(), we got a character that was
+		// unexpected. Everything from the ESC up to just before that
+		// character is treated as plain runes.
+		for _, plain := range s.input[sequenceStart:s.previousByteIndex] {
+			s.handleRune(plain)
+		}
+
+		// Start over with the character that caused the problem
+		current = s.lastChar()
+	}
+
+	s.finalizeCurrentPart()
+}
+
+// handleRune appends one plain rune to the text of the part being built.
+func (s *styledStringSplitter) handleRune(char rune) {
+	// Return values are dropped, exactly as a bare call would do
+	_, _ = s.inProgressString.WriteRune(char)
+}
+
+// handleEscape is called right after an ESC has been read. It reads the next
+// character and, if that starts a CSI ('[') or an OSC (']') sequence, lets
+// consumeControlSequence() deal with the rest.
+//
+// Returns false if the ESC did not introduce a sequence we could handle.
+func (s *styledStringSplitter) handleEscape() bool {
+	switch introducer := s.nextChar(); introducer {
+	case '[', ']':
+		return s.consumeControlSequence(introducer)
+	default:
+		return false
+	}
+}
+
+// consumeControlSequence reads the rest of a control sequence after ESC and
+// charAfterEsc have been consumed. charAfterEsc is '[' for a CSI sequence or
+// ']' for an OSC sequence.
+//
+// Digits and semicolons are collected as parameters, and the first other
+// character ends the sequence. The one exception is the OSC 8 hyperlink prefix
+// "ESC]8;;", after which handleUrl() takes over.
+//
+// Returns false if the input ends mid-sequence, or if whoever handled the
+// sequence reported failure.
+func (s *styledStringSplitter) consumeControlSequence(charAfterEsc rune) bool {
+	// Points to right after "ESC[" or "ESC]"
+	startIndex := s.nextByteIndex
+
+	// We're looking for a non-parameter character to end the sequence
+	for {
+		char := s.nextChar()
+		if char == -1 {
+			return false
+		}
+
+		if char == ';' || (char >= '0' && char <= '9') {
+			// Sequence still in progress
+
+			if charAfterEsc == ']' && s.input[startIndex:s.nextByteIndex] == "8;;" {
+				// Special case, here comes the URL. Only OSC qualifies: after
+				// "ESC[" the very same "8;;" is just the start of a parameter
+				// list, as in "ESC[8;;31m", and must not be read as a
+				// hyperlink.
+				return s.handleUrl()
+			}
+
+			continue
+		}
+
+		// The end, handle what we got
+		endIndexExclusive := s.nextByteIndex
+		return s.handleCompleteControlSequence(charAfterEsc, s.input[startIndex:endIndexExclusive])
+	}
+}
+
+// handleCompleteControlSequence acts on a fully read control sequence.
+//
+// sequence is everything after the two-character introducer, terminator
+// included: for ESC[33m, pass just "33m". It must not be empty.
+//
+// Only CSI sequences (charAfterEsc == '[') are handled: "K" and "0K" record
+// the current style as the trailer, and anything ending in 'm' is passed to
+// rawUpdateStyle() to compute the style of the next part. Returns false for
+// everything else.
+func (s *styledStringSplitter) handleCompleteControlSequence(charAfterEsc rune, sequence string) bool {
+	if charAfterEsc != '[' {
+		return false
+	}
+
+	switch {
+	case sequence == "K" || sequence == "0K":
+		// Clear to end of line
+		s.trailer = s.inProgressStyle
+		return true
+
+	case sequence[len(sequence)-1] == 'm':
+		s.startNewPart(rawUpdateStyle(s.inProgressStyle, sequence))
+		return true
+	}
+
+	return false
+}
+
+// handleUrl reads a hyperlink URL. It is entered right after "ESC]8;;" has
+// been consumed. URLs end with ASCII 7 BEL or ESC \.
+//
+// On a properly terminated URL, the current style with that hyperlink attached
+// is handed to startNewPart() and true is returned. Returns false if the input
+// ends first, if a character not valid in a URL shows up, or if an ESC is
+// followed by anything but a backslash.
+func (s *styledStringSplitter) handleUrl() bool {
+	// Valid URL characters.
+	// Ref: https://stackoverflow.com/a/1547940/473672
+	const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="
+
+	// Points to right after "ESC]8;;"
+	urlStartIndex := s.nextByteIndex
+
+	for {
+		switch char := s.nextChar(); {
+		case char == -1:
+			return false
+
+		case char == '\x07':
+			// End of URL, leave the one-byte BEL out of it
+			url := s.input[urlStartIndex : s.nextByteIndex-1]
+			s.startNewPart(s.inProgressStyle.WithHyperlink(&url))
+			return true
+
+		case char == esc:
+			// Only a backslash may follow, anything else (including end of
+			// input) is a broken terminator
+			if s.nextChar() != '\\' {
+				return false
+			}
+
+			// End of URL, leave the two-byte ESC \ out of it
+			url := s.input[urlStartIndex : s.nextByteIndex-2]
+			s.startNewPart(s.inProgressStyle.WithHyperlink(&url))
+			return true
+
+		case !strings.ContainsRune(validChars, char):
+			return false
+		}
+
+		// It's a valid URL char, keep going
+	}
+}
+
+// startNewPart switches to collecting text in the given style. The part built
+// so far is finalized first and its text buffer cleared. If style equals the
+// style already in progress, nothing happens.
+func (s *styledStringSplitter) startNewPart(style twin.Style) {
+	if style != s.inProgressStyle {
+		s.finalizeCurrentPart()
+		s.inProgressString.Reset()
+		s.inProgressStyle = style
+	}
+}
+
+// finalizeCurrentPart appends the in-progress text and style to parts, unless
+// no text has been collected. The in-progress text is not cleared here, that
+// is left to the caller.
+func (s *styledStringSplitter) finalizeCurrentPart() {
+	if s.inProgressString.Len() > 0 {
+		finished := _StyledString{
+			String: s.inProgressString.String(),
+			Style:  s.inProgressStyle,
+		}
+		s.parts = append(s.parts, finished)
+	}
+}
--- a/m/styledStringSplitter_test.go
+++ b/m/styledStringSplitter_test.go
@ -0,0 +1,27 @@
+package m
+
+import (
+	"testing"
+
+	"gotest.tools/v3/assert"
+)
+
+// A one-rune input yields that rune once, after which both nextChar() and
+// lastChar() report end of input.
+func TestNextCharLastChar_base(t *testing.T) {
+	splitter := styledStringSplitter{input: "a"}
+
+	first := splitter.nextChar()
+	assert.Equal(t, 'a', first)
+	assert.Equal(t, 'a', splitter.lastChar())
+
+	afterEnd := splitter.nextChar()
+	assert.Equal(t, rune(-1), afterEnd)
+	assert.Equal(t, rune(-1), splitter.lastChar())
+}
+
+// With no input at all, the very first nextChar() reports end of input, and
+// lastChar() agrees.
+func TestNextCharLastChar_empty(t *testing.T) {
+	splitter := styledStringSplitter{input: ""}
+
+	got := splitter.nextChar()
+	assert.Equal(t, rune(-1), got)
+	assert.Equal(t, rune(-1), splitter.lastChar())
+}