1
1
mirror of https://github.com/walles/moar.git synced 2024-09-11 12:15:43 +03:00

Merge branch 'johan/functional-ssfs'

This merge splits the styled string splitter into multiple functions for
readability and maintainability.

This makes BenchmarkHighlightedSearch about 10% slower, but I think
it's worth the tradeoff.
This commit is contained in:
Johan Walles 2023-11-11 15:39:25 +01:00
commit cfdf99d3e3
4 changed files with 343 additions and 258 deletions

View File

@ -424,194 +424,10 @@ type _StyledString struct {
Style twin.Style
}
// parseState enumerates the states of the escape sequence state machine in
// styledStringsFromString().
type parseState int
const (
// Not inside any escape sequence
initial parseState = iota
// The previous character was ESC
justSawEsc
// After ESC[, reading digits and semicolons until a terminating character
inStyle
gotOsc // OSC = Operating System Command = ESC]
gotOsc8 // ESC]8
gotOsc8Semi // ESC]8;
inUrl // After ESC]8;;
inUrlGotEsc // Expecting a \ now to terminate the URL
)
// styledStringsFromString splits s into consecutive parts, each with one
// style, by running a state machine over the escape sequences in s.
//
// Handled sequences are ESC[...m (style change, parsed by updateStyle()),
// ESC[K and ESC[0K (the current style is recorded as the trailer) and
// ESC]8;;URL terminated by BEL or ESC\ (hyperlink). Whenever a sequence turns
// out to be unsupported or broken, the state machine goes back to initial
// without moving partStart, so the bytes of that sequence stay in the text of
// the next part.
func styledStringsFromString(s string) styledStringsWithTrailer {
if !strings.ContainsAny(s, "\x1b") {
// This shortcut makes BenchmarkPlainTextSearch() perform a lot better
return styledStringsWithTrailer{
trailer: twin.StyleDefault,
styledStrings: []_StyledString{{
String: s,
Style: twin.StyleDefault,
}},
}
}
trailer := twin.StyleDefault
// NOTE(review): this makes a slice of length 1, so the result always starts
// with one zero-valued _StyledString before any appended parts. Possibly
// make([]_StyledString, 0, 1) was intended -- confirm.
parts := make([]_StyledString, 1)
state := initial
// Position of the ESC that started the sequence we are currently parsing
escIndex := -1 // Byte index into s
// Start of the text that has not yet been added to parts
partStart := 0 // Byte index into s
// Position of the first URL byte, right after ESC]8;;
urlStart := -1 // Byte index into s
// Style that will be assigned to the text starting at partStart
style := twin.StyleDefault
for byteIndex, char := range s {
// Every branch below ends with continue
if state == initial {
if char == '\x1b' {
escIndex = byteIndex
state = justSawEsc
}
continue
} else if state == justSawEsc {
if char == '\x1b' {
// Another ESC, restart the sequence from here
escIndex = byteIndex
state = justSawEsc
} else if char == '[' {
state = inStyle
} else if char == ']' {
state = gotOsc
} else {
state = initial
}
continue
} else if state == inStyle {
if char == '\x1b' {
// Sequence interrupted by a new ESC, restart from here
escIndex = byteIndex
state = justSawEsc
} else if (char >= '0' && char <= '9') || char == ';' {
// Stay in style
} else if char == 'm' {
if partStart < escIndex {
// Consume the most recent part
parts = append(parts, _StyledString{
String: s[partStart:escIndex],
Style: style,
})
}
// updateStyle() gets the whole sequence, from ESC up to and including 'm'
style = updateStyle(style, s[escIndex:byteIndex+1])
partStart = byteIndex + 1 // Next part starts after this 'm'
state = initial
} else if char == 'K' {
ansiStyle := s[escIndex : byteIndex+1]
if ansiStyle != "\x1b[K" && ansiStyle != "\x1b[0K" {
// Not a supported clear operation, just treat the whole thing as plain text
state = initial
continue
}
// Handle clear-to-end-of-line
if partStart < escIndex {
// Consume the most recent part
parts = append(parts, _StyledString{
String: s[partStart:escIndex],
Style: style,
})
}
// The style in effect at the clear becomes the returned trailer
trailer = style
partStart = byteIndex + 1 // Next part starts after this 'K'
state = initial
} else {
// Unsupported sequence, just treat the whole thing as plain text
state = initial
}
continue
} else if state == gotOsc {
if char == '8' {
state = gotOsc8
} else {
state = initial
}
continue
} else if state == gotOsc8 {
if char == ';' {
state = gotOsc8Semi
} else {
state = initial
}
continue
} else if state == gotOsc8Semi {
if char == ';' {
// The URL starts right after this second semicolon
urlStart = byteIndex + 1
state = inUrl
} else {
state = initial
}
continue
} else if state == inUrl {
// Ref: https://stackoverflow.com/a/1547940/473672
const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="
if char == '\x1b' {
// escIndex is left pointing at the ESC that started the OSC sequence
state = inUrlGotEsc
} else if char == '\x07' {
// End of URL
if partStart < escIndex {
// Consume the most recent part
parts = append(parts, _StyledString{
String: s[partStart:escIndex],
Style: style,
})
}
partStart = byteIndex + 1
// The URL ends right before the terminating BEL
url := s[urlStart:byteIndex]
style = style.WithHyperlink(&url)
state = initial
} else if strings.ContainsRune(validChars, char) {
// Stay in URL
} else {
// Invalid URL character, just treat the whole thing as plain text
state = initial
}
continue
} else if state == inUrlGotEsc {
if char == '\\' {
// End of URL
if partStart < escIndex {
// Consume the most recent part
parts = append(parts, _StyledString{
String: s[partStart:escIndex],
Style: style,
})
}
partStart = byteIndex + 1
// byteIndex-1 is the ESC of the terminating ESC\, the URL ends before it
url := s[urlStart : byteIndex-1]
style = style.WithHyperlink(&url)
state = initial
} else {
// Broken ending, just treat the whole thing as plain text
state = initial
}
continue
}
// Only reachable if state holds a value none of the branches above handle
panic("We should never get here")
}
// Whatever remains after the last handled sequence becomes the final part.
// This includes the bytes of any sequence that was still incomplete when s
// ended.
if partStart < len(s) {
// Consume the most recent part
parts = append(parts, _StyledString{
String: s[partStart:],
Style: style,
})
}
return styledStringsWithTrailer{
styledStrings: parts,
trailer: trailer,
}
}
// updateStyle parses a string of the form "ESC[33m" into changes to style
func updateStyle(style twin.Style, escapeSequence string) twin.Style {
numbers := strings.Split(escapeSequence[2:len(escapeSequence)-1], ";")
// rawUpdateStyle parses a string of the form "33m" into changes to style. This
// is what comes after ESC[ in an ANSI SGR sequence.
func rawUpdateStyle(style twin.Style, escapeSequenceWithoutHeader string) twin.Style {
numbers := strings.Split(escapeSequenceWithoutHeader[:len(escapeSequenceWithoutHeader)-1], ";")
index := 0
for index < len(numbers) {
number := numbers[index]

View File

@ -28,76 +28,78 @@ func cellsToPlainString(cells []twin.Cell) string {
// without logging any errors
func TestTokenize(t *testing.T) {
for _, fileName := range getTestFiles() {
file, err := os.Open(fileName)
if err != nil {
t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
continue
}
defer func() {
if err := file.Close(); err != nil {
panic(err)
t.Run(fileName, func(t *testing.T) {
file, err := os.Open(fileName)
if err != nil {
t.Errorf("Error opening file <%s>: %s", fileName, err.Error())
return
}
}()
defer func() {
if err := file.Close(); err != nil {
panic(err)
}
}()
myReader := NewReaderFromStream(fileName, file)
for !myReader.done.Load() {
}
for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
line := myReader.GetLine(lineNumber)
lineNumber++
var loglines strings.Builder
log.SetOutput(&loglines)
tokens := cellsFromString(line.raw).Cells
plainString := withoutFormatting(line.raw)
if len(tokens) != utf8.RuneCountInString(plainString) {
t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
fileName, lineNumber,
len(tokens), utf8.RuneCountInString(plainString), line.raw)
continue
myReader := NewReaderFromStream(fileName, file)
for !myReader.done.Load() {
}
// Tokens and plain have the same lengths, compare contents
plainStringChars := []rune(plainString)
for index, plainChar := range plainStringChars {
cellChar := tokens[index]
if cellChar.Rune == plainChar {
for lineNumber := 1; lineNumber <= myReader.GetLineCount(); lineNumber++ {
line := myReader.GetLine(lineNumber)
lineNumber++
var loglines strings.Builder
log.SetOutput(&loglines)
tokens := cellsFromString(line.raw).Cells
plainString := withoutFormatting(line.raw)
if len(tokens) != utf8.RuneCountInString(plainString) {
t.Errorf("%s:%d: len(tokens)=%d, len(plainString)=%d for: <%s>",
fileName, lineNumber,
len(tokens), utf8.RuneCountInString(plainString), line.raw)
continue
}
if cellChar.Rune == '•' && plainChar == 'o' {
// Pretty bullets on man pages
// Tokens and plain have the same lengths, compare contents
plainStringChars := []rune(plainString)
for index, plainChar := range plainStringChars {
cellChar := tokens[index]
if cellChar.Rune == plainChar {
continue
}
if cellChar.Rune == '•' && plainChar == 'o' {
// Pretty bullets on man pages
continue
}
// Chars mismatch!
plainStringFromCells := cellsToPlainString(tokens)
positionMarker := strings.Repeat(" ", index) + "^"
cellCharString := string(cellChar.Rune)
if !twin.Printable(cellChar.Rune) {
cellCharString = fmt.Sprint(int(cellChar.Rune))
}
plainCharString := string(plainChar)
if !twin.Printable(plainChar) {
plainCharString = fmt.Sprint(int(plainChar))
}
t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n %s",
fileName, lineNumber, index,
cellCharString, plainCharString,
plainString,
plainStringFromCells,
positionMarker,
)
break
}
if len(loglines.String()) != 0 {
t.Errorf("%s: %s", fileName, loglines.String())
continue
}
// Chars mismatch!
plainStringFromCells := cellsToPlainString(tokens)
positionMarker := strings.Repeat(" ", index) + "^"
cellCharString := string(cellChar.Rune)
if !twin.Printable(cellChar.Rune) {
cellCharString = fmt.Sprint(int(cellChar.Rune))
}
plainCharString := string(plainChar)
if !twin.Printable(plainChar) {
plainCharString = fmt.Sprint(int(plainChar))
}
t.Errorf("%s:%d, 0-based column %d: cell char <%s> != plain char <%s>:\nPlain: %s\nCells: %s\n %s",
fileName, lineNumber, index,
cellCharString, plainCharString,
plainString,
plainStringFromCells,
positionMarker,
)
break
}
if len(loglines.String()) != 0 {
t.Errorf("%s: %s", fileName, loglines.String())
continue
}
}
})
}
}
@ -229,8 +231,8 @@ func TestConsumeCompositeColorIncomplete24Bit(t *testing.T) {
assert.Assert(t, color == nil)
}
func TestUpdateStyle(t *testing.T) {
numberColored := updateStyle(twin.StyleDefault, "\x1b[33m")
func TestRawUpdateStyle(t *testing.T) {
numberColored := rawUpdateStyle(twin.StyleDefault, "33m")
assert.Equal(t, numberColored, twin.StyleDefault.Foreground(twin.NewColor16(3)))
}
@ -287,15 +289,18 @@ func TestHyperlink_incomplete(t *testing.T) {
complete := "a\x1b]8;;X\x1b\\"
for l := len(complete) - 1; l >= 0; l-- {
tokens := cellsFromString(complete[:l]).Cells
incomplete := complete[:l]
t.Run(fmt.Sprintf("l=%d incomplete=<%s>", l, strings.ReplaceAll(incomplete, "\x1b", "ESC")), func(t *testing.T) {
tokens := cellsFromString(incomplete).Cells
for i := 0; i < l; i++ {
if complete[i] == '\x1b' {
// These get special rendering, if everything else matches
// that's good enough.
continue
for i := 0; i < l; i++ {
if complete[i] == '\x1b' {
// These get special rendering, if everything else matches
// that's good enough.
continue
}
assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
}
assert.Equal(t, tokens[i], twin.Cell{Rune: rune(complete[i]), Style: twin.StyleDefault})
}
})
}
}

237
m/styledStringSplitter.go Normal file
View File

@ -0,0 +1,237 @@
package m
import (
"strings"
"unicode/utf8"
"github.com/walles/moar/twin"
)
// esc is the ASCII escape character. styledStringSplitter.run() treats it as
// the start of an escape sequence.
const esc = '\x1b'
// styledStringSplitter holds the state needed for splitting one input string
// into parts with one style each. Set input, call run(), then read parts and
// trailer.
type styledStringSplitter struct {
// The string being split
input string
// Byte index into input, this is where nextChar() will decode its next rune
nextByteIndex int
// Byte index into input of the rune most recently returned by nextChar(),
// or len(input) if nextChar() reported end of input
previousByteIndex int
// Text collected so far for the part currently being built
inProgressString strings.Builder
// Style of the part currently being built
inProgressStyle twin.Style
// Finished parts, appended to by finalizeCurrentPart()
parts []_StyledString
// Set to the in-progress style when ESC[K or ESC[0K is seen
trailer twin.Style
}
// styledStringsFromString splits s into parts with one style each, plus a
// trailer style.
//
// Strings without any ESC character are returned as one single part in the
// default style, without involving the splitter at all.
func styledStringsFromString(s string) styledStringsWithTrailer {
	if strings.ContainsRune(s, '\x1b') {
		// There are escape sequences to interpret, let the splitter do it
		splitter := styledStringSplitter{input: s}
		splitter.run()

		return styledStringsWithTrailer{
			styledStrings: splitter.parts,
			trailer:       splitter.trailer,
		}
	}

	// Plain text only. Taking this shortcut makes BenchmarkPlainTextSearch()
	// perform a lot better.
	wholeString := _StyledString{
		String: s,
		Style:  twin.StyleDefault,
	}
	return styledStringsWithTrailer{
		styledStrings: []_StyledString{wholeString},
		trailer:       twin.StyleDefault,
	}
}
// nextChar decodes and returns the next rune of the input and moves past it.
// Returns -1 when the input is exhausted.
//
// After this call, previousByteIndex points to where the returned rune starts,
// or to the end of the input if -1 was returned.
func (s *styledStringSplitter) nextChar() rune {
	// Whatever happens below, the current position is what we return from
	s.previousByteIndex = s.nextByteIndex

	if s.nextByteIndex >= len(s.input) {
		return -1
	}

	decoded, width := utf8.DecodeRuneInString(s.input[s.nextByteIndex:])
	s.nextByteIndex += width
	return decoded
}
// lastChar returns whatever the most recent nextChar() call returned, by
// decoding the rune at previousByteIndex again. Returns -1 if that call hit
// the end of the input.
func (s *styledStringSplitter) lastChar() rune {
	if s.previousByteIndex < len(s.input) {
		decoded, _ := utf8.DecodeRuneInString(s.input[s.previousByteIndex:])
		return decoded
	}

	return -1
}
// run walks the whole input, collecting plain runes into the in-progress part
// and handing escape sequences to handleEscape(). When the input ends, the
// in-progress part is finalized.
func (s *styledStringSplitter) run() {
	for current := s.nextChar(); current != -1; {
		if current != esc {
			// Ordinary text
			s.handleRune(current)
			current = s.nextChar()
			continue
		}

		sequenceStart := s.previousByteIndex
		if s.handleEscape() {
			// The whole sequence was understood and consumed
			current = s.nextChar()
			continue
		}

		// handleEscape() ran into something unexpected. Everything from the
		// ESC up to, but not including, the offending character is kept as
		// plain text.
		for _, literal := range s.input[sequenceStart:s.previousByteIndex] {
			s.handleRune(literal)
		}

		// Take another look at the offending character, it could be the
		// start of a new sequence, or the end of the input.
		current = s.lastChar()
	}

	s.finalizeCurrentPart()
}
// handleRune adds one plain-text rune to the part currently being built.
func (s *styledStringSplitter) handleRune(char rune) {
	// strings.Builder.WriteRune() is documented to always return a nil error
	_, _ = s.inProgressString.WriteRune(char)
}
// handleEscape is called right after an ESC has been read. It returns true if
// a complete supported sequence was consumed. On false, previousByteIndex
// points to the character that broke the sequence.
func (s *styledStringSplitter) handleEscape() bool {
	switch introducer := s.nextChar(); introducer {
	case '[', ']':
		// '[' starts a CSI sequence and ']' an OSC sequence
		return s.consumeControlSequence(introducer)
	default:
		return false
	}
}
// consumeControlSequence reads the rest of a sequence after "ESC[" (CSI) or
// "ESC]" (OSC) has been consumed. charAfterEsc is '[' or ']' and tells which
// one it was.
//
// Digits and semicolons are collected until some other character ends the
// sequence, and the result is then passed to handleCompleteControlSequence().
// The exception is an OSC sequence starting with "8;;", which is a hyperlink
// and is handed to handleUrl().
//
// Returns true if a complete supported sequence was consumed. On false,
// previousByteIndex points to the character that broke the sequence, or to
// the end of the input if the input ended mid-sequence.
func (s *styledStringSplitter) consumeControlSequence(charAfterEsc rune) bool {
	// Points to right after "ESC[" or "ESC]"
	startIndex := s.nextByteIndex

	// We're looking for a character that is neither a digit nor a semicolon
	// to end the sequence
	for {
		char := s.nextChar()
		if char == -1 {
			// Input ended before the sequence did
			return false
		}

		if char == ';' || (char >= '0' && char <= '9') {
			// Sequence still in progress

			// Hyperlinks are OSC 8 sequences, so only "ESC]8;;" introduces a
			// URL. "ESC[8;;" is a CSI sequence and must not be parsed as a
			// hyperlink.
			if charAfterEsc == ']' && s.input[startIndex:s.nextByteIndex] == "8;;" {
				// Special case, here comes the URL
				return s.handleUrl()
			}

			continue
		}

		// The end, handle what we got. The terminating character is part of
		// the sequence we pass on.
		endIndexExclusive := s.nextByteIndex
		return s.handleCompleteControlSequence(charAfterEsc, s.input[startIndex:endIndexExclusive])
	}
}
// handleCompleteControlSequence acts on a fully read control sequence.
//
// sequence is what came after "ESC[" or "ESC]", including the terminating
// character. For ESC[33m, sequence should be just "33m". It must not be empty.
//
// Only CSI sequences (charAfterEsc == '[') are supported: "K" and "0K" record
// the current style as the trailer, and anything ending in 'm' updates the
// style. Returns false for everything else.
func (s *styledStringSplitter) handleCompleteControlSequence(charAfterEsc rune, sequence string) bool {
	if charAfterEsc != '[' {
		return false
	}

	switch sequence {
	case "K", "0K":
		// Clear to end of line
		s.trailer = s.inProgressStyle
		return true
	}

	if terminator := sequence[len(sequence)-1]; terminator != 'm' {
		// Not a style sequence, and we don't support anything else
		return false
	}

	s.startNewPart(rawUpdateStyle(s.inProgressStyle, sequence))
	return true
}
// handleUrl reads a hyperlink URL. It is called right after "ESC]8;;" has been
// consumed. URLs end with ASCII 7 (BEL) or with ESC \.
//
// On success a new part is started with the URL attached to the current style,
// and true is returned. Returns false if the input ends before the URL does,
// on characters not valid in URLs, and if ESC is followed by anything but a
// backslash.
func (s *styledStringSplitter) handleUrl() bool {
	// Valid URL characters.
	// Ref: https://stackoverflow.com/a/1547940/473672
	const validChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="

	// Points to right after "ESC]8;;"
	urlStartIndex := s.nextByteIndex

	for {
		switch char := s.nextChar(); {
		case char == -1:
			// Input ended in the middle of the URL
			return false

		case char == esc:
			// Only a backslash is acceptable after ESC. This check also
			// covers the input ending right here.
			if s.nextChar() != '\\' {
				return false
			}

			// End of URL, leave out the two bytes of the ESC \ terminator
			url := s.input[urlStartIndex : s.nextByteIndex-2]
			s.startNewPart(s.inProgressStyle.WithHyperlink(&url))
			return true

		case char == '\x07':
			// End of URL, leave out the one-byte BEL terminator
			url := s.input[urlStartIndex : s.nextByteIndex-1]
			s.startNewPart(s.inProgressStyle.WithHyperlink(&url))
			return true

		case !strings.ContainsRune(validChars, char):
			return false
		}

		// It's a valid URL char, keep going
	}
}
// startNewPart switches to a new style. Unless the style is the same as the
// current one, the part in progress is finalized and an empty part with the
// new style is started.
func (s *styledStringSplitter) startNewPart(style twin.Style) {
	if style != s.inProgressStyle {
		s.finalizeCurrentPart()

		s.inProgressString.Reset()
		s.inProgressStyle = style
	}

	// Same style as before, just keep adding to the current part
}
// finalizeCurrentPart adds the part in progress to the list of finished
// parts, unless it is empty. The in-progress text is left untouched; clearing
// it is up to the caller.
func (s *styledStringSplitter) finalizeCurrentPart() {
	if s.inProgressString.Len() > 0 {
		finished := _StyledString{
			String: s.inProgressString.String(),
			Style:  s.inProgressStyle,
		}
		s.parts = append(s.parts, finished)
	}
}

View File

@ -0,0 +1,27 @@
package m
import (
"testing"
"gotest.tools/v3/assert"
)
// Verify that lastChar() mirrors nextChar(), both for an actual character and
// when the input has run out.
func TestNextCharLastChar_base(t *testing.T) {
	splitter := styledStringSplitter{input: "a"}

	// The one character of the input
	assert.Equal(t, 'a', splitter.nextChar())
	assert.Equal(t, 'a', splitter.lastChar())

	// Past the end of the input
	assert.Equal(t, rune(-1), splitter.nextChar())
	assert.Equal(t, rune(-1), splitter.lastChar())
}
// With an empty input, both nextChar() and lastChar() should report end of
// input right away.
func TestNextCharLastChar_empty(t *testing.T) {
	splitter := styledStringSplitter{input: ""}

	assert.Equal(t, rune(-1), splitter.nextChar())
	assert.Equal(t, rune(-1), splitter.lastChar())
}