Port new shlex code to Go

Kovid Goyal 2023-12-04 14:14:11 +05:30
parent 04eafbea9b
commit a1f2a7df4d
4 changed files with 199 additions and 434 deletions

View File

@@ -28,6 +28,9 @@ func TestParseSSHArgs(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if len(ans) == 0 {
ans = []string{}
}
return ans
}
@@ -39,7 +42,7 @@ func TestParseSSHArgs(t *testing.T) {
check := func(a, b any) {
diff := cmp.Diff(a, b)
if diff != "" {
t.Fatalf("Unexpected value for args: %s\n%s", args, diff)
t.Fatalf("Unexpected value for args: %#v\n%s", args, diff)
}
}
check(split(expected_ssh_args), ssh_args)
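The normalization of a nil slice to an empty one in the hunk above is presumably needed because go-cmp, by default, reports a nil slice and an empty non-nil slice as different. A minimal standalone sketch of that behaviour (a hypothetical program, not part of this commit):

package main

import (
	"fmt"

	"github.com/google/go-cmp/cmp"
)

func main() {
	var parsed []string // nil: what a parser typically returns for "no arguments"
	want := []string{}  // empty but non-nil
	// By default cmp distinguishes nil from empty, so this diff is non-empty.
	fmt.Println(cmp.Diff(want, parsed))
	// cmpopts.EquateEmpty is the usual alternative to normalizing by hand.
}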

View File

@@ -109,7 +109,6 @@ next_word(Shlex *self, PyObject *args UNUSED) {
switch(ch) {
case STRING_WITHOUT_ESCAPES_DELIM:
set_state(self, WORD);
if (self->buf_pos && self->state == NORMAL) return get_word(self);
break;
default: write_ch(self, ch); break;
} break;
@@ -117,13 +116,9 @@ next_word(Shlex *self, PyObject *args UNUSED) {
switch(ch) {
case STRING_WITH_ESCAPES_DELIM:
set_state(self, WORD);
if (self->buf_pos && self->state == NORMAL) return get_word(self);
break;
case ESCAPE_CHAR:
if (self->src_pos < self->src_sz) {
Py_UCS4 nch = PyUnicode_READ(self->kind, self->src_data, self->src_pos); self->src_pos++;
write_ch(self, nch);
}
write_escape_ch(self);
break;
default: write_ch(self, ch); break;
} break;

View File

@@ -12,419 +12,204 @@ To process a stream of strings:
for ; token, err := l.Next(); err != nil {
// process token
}
To access the raw token stream (which includes tokens for spaces):
t := NewTokenizer(os.Stdin)
for ; token, err := t.Next(); err != nil {
// process token
}
*/
package shlex
// Based on https://pkg.go.dev/github.com/google/shlex with many improvements
// Relicensed to GPLv3 since all my additions/changes are GPLv3, which makes the
// original work, which was APL2, also GPLv3
import (
"errors"
"fmt"
"io"
"strings"
"unicode/utf8"
)
// TokenType is a top-level token classification: A word, space, unknown.
type TokenType int
// runeTokenClass is the type of a UTF-8 character classification: A quote, space, escape.
type runeTokenClass int
// the internal state used by the lexer state machine
type lexerState int
// Token is a (type, value) pair representing a lexographical token.
type Token struct {
Type TokenType
Value string
Pos int64
type Word struct {
Value string // The word is empty if EOF is reached
Pos int // The position in the input string of the word or the trailer
Err error // Indicates an error (unterminated string or trailing unescaped backslash)
Trailer string // Extra trailing data such as an unterminated string or an unescaped backslash. Present only if Err != nil
}
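Since Word folds EOF (empty Value) and failures (Err plus the unconsumed Trailer) into a single value, a consumer needs no separate sentinel type. A sketch of driving the new API, with the import path assumed and the expected output taken from the test table later in this diff:

package main

import (
	"fmt"
	"log"

	"kitty/tools/utils/shlex" // import path is an assumption
)

func main() {
	l := shlex.NewLexer(`x "ab"y \m`)
	for {
		w := l.Next()
		if w.Err != nil {
			log.Fatal(w.Err) // w.Trailer would hold the unterminated remainder
		}
		if w.Value == "" { // EOF is signalled by an empty Value
			break
		}
		fmt.Printf("%d: %q\n", w.Pos, w.Value) // 0: "x", then 2: "aby", then 8: "m"
	}
}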
// Named classes of UTF-8 runes
const (
spaceRunes = " \t\r\n"
escapingQuoteRunes = `"`
nonEscapingQuoteRunes = "'"
escapeRunes = `\`
)
// Classes of rune token
const (
unknownRuneClass runeTokenClass = iota
spaceRuneClass
escapingQuoteRuneClass
nonEscapingQuoteRuneClass
escapeRuneClass
eofRuneClass
)
// Classes of lexographic token
const (
UnknownToken TokenType = iota
WordToken
SpaceToken
)
func (t TokenType) String() string {
switch t {
default:
return "UnknownToken"
case WordToken:
return "WordToken"
case SpaceToken:
return "SpaceToken"
}
}
type lexer_state int
// Lexer state machine states
const (
startState lexerState = iota // no runes have been seen
inWordState // processing regular runes in a word
inSpaceState // processing runes in a space
escapingState // we have just consumed an escape rune; the next rune is literal
escapingQuotedState // we have just consumed an escape rune within a quoted string
quotingEscapingState // we are within a quoted string that supports escaping ("...")
quotingState // we are within a string that does not support escaping ('...')
lex_normal lexer_state = iota // between words, consuming whitespace
word                          // inside an unquoted word
string_without_escapes       // inside '...': backslash is literal
string_with_escapes          // inside "...": backslash escapes the next rune
)
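Read together with Next below, these four states express the whole grammar. A hand trace of one input from the tests at the end of this diff (an illustration, not generated output):

// Input: x'y"\z'1
//   x    lex_normal -> word; 'x' is written to the buffer
//   '    word -> string_without_escapes (the delimiter itself is not written)
//   y"\z written verbatim: neither " nor \ is special inside '...'
//   '    string_without_escapes -> word
//   1    written
//   EOF  state is word, buffer non-empty -> Word{Pos: 0, Value: `xy"\z1`}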
// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
for _, runeChar := range runes {
typeMap[runeChar] = tokenType
}
}
// newDefaultClassifier creates a new classifier for ASCII characters.
func newDefaultClassifier() tokenClassifier {
t := tokenClassifier{}
t.addRuneClass(spaceRunes, spaceRuneClass)
t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
t.addRuneClass(escapeRunes, escapeRuneClass)
return t
}
// ClassifyRune classifies a rune
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
return t[runeVal]
}
// Lexer turns an input stream into a sequence of tokens. Whitespace is skipped.
type Lexer Tokenizer
// NewLexer creates a new lexer from an input stream.
func NewLexer(x io.RuneReader) *Lexer {
return (*Lexer)(NewTokenizer(x))
type Lexer struct {
state lexer_state
src string
src_sz, src_pos, word_start int
buf strings.Builder
}
// Next returns the next word, or an error. If there are no more words,
// the error will be io.EOF.
func (l *Lexer) Next() (string, error) {
for {
token, err := (*Tokenizer)(l).Next()
if err != nil {
return "", err
}
switch token.Type {
case WordToken:
return token.Value, nil
case SpaceToken:
// skip spaces
default:
return "", fmt.Errorf("Unknown token type: %s", token.Type)
}
}
// NewLexer creates a new lexer from an input string.
func NewLexer(x string) *Lexer {
return &Lexer{src: x, src_sz: len(x)}
}
// Tokenizer turns an input stream into a sequence of typed tokens
type Tokenizer struct {
input io.RuneReader
classifier tokenClassifier
pos int64
redo_rune struct {
char rune
sz int
rune_type runeTokenClass
}
func (self *Lexer) start_word() {
self.buf.Reset()
self.word_start = self.src_pos - 1
}
// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(input io.RuneReader) *Tokenizer {
classifier := newDefaultClassifier()
return &Tokenizer{
input: input,
classifier: classifier}
func (self *Lexer) get_word() Word {
return Word{Pos: self.word_start, Value: self.buf.String()}
}
var ErrTrailingEscape error = errors.New("EOF found after escape character")
var ErrTrailingQuoteEscape error = errors.New("EOF found after escape character for double quote")
var ErrUnclosedDoubleQuote error = errors.New("EOF found when expecting closing double quote")
var ErrUnclosedSingleQuote error = errors.New("EOF found when expecting closing single quote")
func (self *Lexer) write_ch(ch byte) {
self.buf.WriteByte(ch)
}
// scanStream scans the stream for the next token using the internal state machine.
// It will panic if it encounters a rune which it does not know how to handle.
func (t *Tokenizer) scanStream() (*Token, error) {
state := startState
var tokenType TokenType
var nextRune rune
var nextRuneType runeTokenClass
var err error
var sz int
value := strings.Builder{}
pos_at_start := t.pos
unread_rune := func() {
t.redo_rune.sz = sz
t.redo_rune.char = nextRune
t.redo_rune.rune_type = nextRuneType
t.pos -= int64(sz)
}
token := func() *Token {
return &Token{tokenType, value.String(), pos_at_start}
}
for {
if t.redo_rune.sz > 0 {
nextRune, sz = t.redo_rune.char, t.redo_rune.sz
nextRuneType = t.redo_rune.rune_type
t.redo_rune.sz = 0
} else {
nextRune, sz, err = t.input.ReadRune()
nextRuneType = t.classifier.ClassifyRune(nextRune)
func (self *Lexer) write_escaped_ch() bool {
ch, count := utf8.DecodeRuneInString(self.src[self.src_pos:])
if count > 0 {
self.src_pos += count
if ch != utf8.RuneError {
self.buf.WriteRune(ch)
}
return true
}
return false
}
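write_escaped_ch leans on the two sentinel results of utf8.DecodeRuneInString: a zero count means the source is exhausted (a trailing backslash, reported to the caller as false), while RuneError with a non-zero count is an invalid byte that is consumed but never written. A quick standalone illustration:

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	ch, n := utf8.DecodeRuneInString("") // nothing left after a backslash
	fmt.Println(ch == utf8.RuneError, n) // true 0 -> write_escaped_ch returns false

	ch, n = utf8.DecodeRuneInString("\xff") // invalid UTF-8 byte
	fmt.Println(ch == utf8.RuneError, n)    // true 1 -> consumed, but nothing written

	ch, n = utf8.DecodeRuneInString("éx") // valid multi-byte rune
	fmt.Println(string(ch), n)            // é 2 -> rune written, two bytes consumed
}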
if err == io.EOF {
nextRuneType = eofRuneClass
err = nil
} else if err != nil {
return nil, err
}
t.pos += int64(sz)
switch state {
case startState: // no runes read yet
{
switch nextRuneType {
case eofRuneClass:
{
return nil, io.EOF
}
case spaceRuneClass:
{
tokenType = SpaceToken
value.WriteRune(nextRune)
state = inSpaceState
}
case escapingQuoteRuneClass:
{
tokenType = WordToken
state = quotingEscapingState
}
case nonEscapingQuoteRuneClass:
{
tokenType = WordToken
state = quotingState
}
case escapeRuneClass:
{
tokenType = WordToken
state = escapingState
}
default:
{
tokenType = WordToken
value.WriteRune(nextRune)
state = inWordState
}
// Next returns the next word. At EOF Word.Value will be ""
func (self *Lexer) Next() (ans Word) {
const string_with_escapes_delim = '"'
const string_without_escapes_delim = '\''
const escape_char = '\\'
for self.src_pos < self.src_sz {
ch := self.src[self.src_pos]
self.src_pos++
switch self.state {
case lex_normal:
switch ch {
case ' ', '\n', '\r', '\t':
case string_with_escapes_delim:
self.state = string_with_escapes
self.start_word()
case string_without_escapes_delim:
self.state = string_without_escapes
self.start_word()
case escape_char:
self.start_word()
if !self.write_escaped_ch() {
ans.Trailer = "\\"
ans.Err = fmt.Errorf("Extra backslash at end of input")
ans.Pos = self.word_start
return
}
self.state = word
default:
self.state = word
self.start_word()
self.write_ch(ch)
}
case inSpaceState: // in a sequence of spaces separating words
{
switch nextRuneType {
case spaceRuneClass:
{
value.WriteRune(nextRune)
}
default:
{
unread_rune()
return token(), err
}
case word:
switch ch {
case ' ', '\n', '\r', '\t':
self.state = lex_normal
if self.buf.Len() > 0 {
return self.get_word()
}
}
case inWordState: // in a regular word
{
switch nextRuneType {
case eofRuneClass:
{
return token(), err
}
case spaceRuneClass:
{
unread_rune()
return token(), err
}
case escapingQuoteRuneClass:
{
state = quotingEscapingState
}
case nonEscapingQuoteRuneClass:
{
state = quotingState
}
case escapeRuneClass:
{
state = escapingState
}
default:
{
value.WriteRune(nextRune)
}
case string_with_escapes_delim:
self.state = string_with_escapes
case string_without_escapes_delim:
self.state = string_without_escapes
case escape_char:
if !self.write_escaped_ch() {
ans.Pos = self.word_start
ans.Trailer = self.buf.String() + "\\"
ans.Err = fmt.Errorf("Extra backslash at end of input")
return
}
default:
self.write_ch(ch)
}
case escapingState: // the rune after an escape character
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrTrailingEscape
return token(), err
}
default:
{
state = inWordState
value.WriteRune(nextRune)
}
}
case string_without_escapes:
switch ch {
case string_without_escapes_delim:
self.state = word
default:
self.write_ch(ch)
}
case escapingQuotedState: // the next rune after an escape character, in double quotes
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrTrailingQuoteEscape
return token(), err
}
default:
{
state = quotingEscapingState
value.WriteRune(nextRune)
}
}
}
case quotingEscapingState: // in escaping double quotes
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrUnclosedDoubleQuote
return token(), err
}
case escapingQuoteRuneClass:
{
state = inWordState
}
case escapeRuneClass:
{
state = escapingQuotedState
}
default:
{
value.WriteRune(nextRune)
}
}
}
case quotingState: // in non-escaping single quotes
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrUnclosedSingleQuote
return token(), err
}
case nonEscapingQuoteRuneClass:
{
state = inWordState
}
default:
{
value.WriteRune(nextRune)
}
}
}
default:
{
return nil, fmt.Errorf("Unexpected state: %v", state)
case string_with_escapes:
switch ch {
case string_with_escapes_delim:
self.state = word
case escape_char:
self.write_escaped_ch()
default:
self.write_ch(ch)
}
}
}
}
switch self.state {
case word:
self.state = lex_normal
if self.buf.Len() > 0 {
return self.get_word()
}
case string_with_escapes, string_without_escapes:
self.state = lex_normal
ans.Trailer = self.buf.String()
ans.Pos = self.word_start
ans.Err = fmt.Errorf("Unterminated string at end of input")
return
case lex_normal:
// Next returns the next token in the stream.
func (t *Tokenizer) Next() (*Token, error) {
return t.scanStream()
}
// Pos returns the current position in the string as a byte offset
func (t *Tokenizer) Pos() int64 {
return t.pos
}
return
}
// Split partitions a string into a slice of strings.
func Split(s string) ([]string, error) {
l := NewLexer(strings.NewReader(s))
subStrings := make([]string, 0)
func Split(s string) (ans []string, err error) {
l := NewLexer(s)
var word Word
for {
word, err := l.Next()
if err != nil {
if err == io.EOF {
return subStrings, nil
}
return subStrings, err
word = l.Next()
if word.Err != nil {
return ans, word.Err
}
subStrings = append(subStrings, word)
if word.Value == "" {
break
}
ans = append(ans, word.Value)
}
return
}
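Split stops at the first error, surfacing Word.Err unchanged and returning the words collected so far. For example, with inputs taken from the tests below (import path again assumed):

package main

import (
	"fmt"

	"kitty/tools/utils/shlex" // import path is an assumption
)

func main() {
	words, err := shlex.Split(`\abc\ d "e f"`)
	fmt.Println(err)           // <nil>
	fmt.Printf("%#v\n", words) // []string{"abc d", "e f"}

	_, err = shlex.Split(`'abc`) // unterminated single-quoted string
	fmt.Println(err != nil)      // true
}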
// SplitForCompletion partitions a string into a slice of strings. It differs from Split in being
// more relaxed about errors and also adding an empty string at the end if s ends with a SpaceToken.
// more relaxed about errors and also adding an empty string at the end if s ends with a Space.
func SplitForCompletion(s string) (argv []string, position_of_last_arg int) {
t := NewTokenizer(strings.NewReader(s))
t := NewLexer(s)
argv = make([]string, 0, len(s)/4)
token := &Token{}
for {
ntoken, err := t.Next()
if err == io.EOF {
if token.Type == SpaceToken {
argv = append(argv, "")
token.Pos += int64(len(token.Value))
word := t.Next()
if word.Value == "" {
if word.Trailer == "" {
trimmed := strings.TrimRight(s, " ")
if len(trimmed) < len(s) { // trailing spaces
pos := position_of_last_arg
if len(argv) > 0 {
pos += len(argv[len(argv)-1])
}
if pos < len(s) { // trailing whitespace
argv = append(argv, "")
position_of_last_arg += len(s) - pos + 1
}
}
} else {
argv = append(argv, word.Trailer)
position_of_last_arg = word.Pos
}
return argv, int(token.Pos)
break
}
if ntoken == nil {
return []string{}, -1
}
switch ntoken.Type {
case WordToken:
argv = append(argv, ntoken.Value)
case SpaceToken:
// skip spaces
default:
return []string{}, -1
}
token = ntoken
position_of_last_arg = word.Pos
argv = append(argv, word.Value)
}
return
}
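The test at the bottom of this diff pins the basic contract: splitting "a b" yields ["a", "b"] with the last argument starting at offset 2. By my reading of the trailing-whitespace branch above, a trailing space additionally appends an empty argument positioned past the last word; the second call below is that derivation, not a documented case:

package main

import (
	"fmt"

	"kitty/tools/utils/shlex" // import path is an assumption
)

func main() {
	argv, pos := shlex.SplitForCompletion("a b")
	fmt.Printf("%#v %d\n", argv, pos) // []string{"a", "b"} 2 (matches the test below)

	argv, pos = shlex.SplitForCompletion("a b ")
	fmt.Printf("%#v %d\n", argv, pos) // []string{"a", "b", ""} 4 (derived, see above)
}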

View File

@@ -1,7 +1,6 @@
package shlex
import (
"strings"
"testing"
"github.com/google/go-cmp/cmp"
@@ -13,78 +12,24 @@ var (
testString = "one two \"three four\" \"five \\\"six\\\"\" seven#eight # nine # ten eleven 'twelve\\' thirteen=13 fourteen/14"
)
func TestClassifier(t *testing.T) {
classifier := newDefaultClassifier()
tests := map[rune]runeTokenClass{
' ': spaceRuneClass,
'"': escapingQuoteRuneClass,
'\'': nonEscapingQuoteRuneClass}
for runeChar, want := range tests {
got := classifier.ClassifyRune(runeChar)
if got != want {
t.Errorf("ClassifyRune(%v) -> %v. Want: %v", runeChar, got, want)
}
}
}
func TestTokenizer(t *testing.T) {
testInput := testString
expectedTokens := []*Token{
{WordToken, "one", 0},
{SpaceToken, " ", 3},
{WordToken, "two", 4},
{SpaceToken, " ", 7},
{WordToken, "three four", 8},
{SpaceToken, " ", 20},
{WordToken, "five \"six\"", 21},
{SpaceToken, " ", 35},
{WordToken, "seven#eight", 36},
{SpaceToken, " ", 47},
{WordToken, "#", 48},
{SpaceToken, " ", 49},
{WordToken, "nine", 50},
{SpaceToken, " ", 54},
{WordToken, "#", 55},
{SpaceToken, " ", 56},
{WordToken, "ten", 57},
{SpaceToken, " ", 60},
{WordToken, "eleven", 61},
{SpaceToken, " ", 67},
{WordToken, "twelve\\", 68},
{SpaceToken, " ", 77},
{WordToken, "thirteen=13", 78},
{SpaceToken, " ", 89},
{WordToken, "fourteen/14", 90},
}
tokenizer := NewTokenizer(strings.NewReader(testInput))
for i, want := range expectedTokens {
got, err := tokenizer.Next()
if err != nil {
t.Error(err)
}
if diff := cmp.Diff(want, got); diff != "" {
t.Fatalf("Tokenizer.Next()[%v] of: %s:\n%s", i, testString, diff)
}
}
}
func TestLexer(t *testing.T) {
testInput := testString
expectedStrings := []string{"one", "two", "three four", "five \"six\"", "seven#eight", "#", "nine", "#", "ten", "eleven", "twelve\\", "thirteen=13", "fourteen/14"}
lexer := NewLexer(strings.NewReader(testInput))
lexer := NewLexer(testInput)
for i, want := range expectedStrings {
got, err := lexer.Next()
if err != nil {
t.Error(err)
}
if got != want {
got := lexer.Next()
if got.Value != want {
t.Errorf("Lexer.Next()[%v] of %q -> %v. Want: %v", i, testString, got, want)
}
}
}
type Tok struct {
Pos int
Val string
}
func TestSplit(t *testing.T) {
want := []string{"one", "two", "three four", "five \"six\"", "seven#eight", "#", "nine", "#", "ten", "eleven", "twelve\\", "thirteen=13", "fourteen/14"}
got, err := Split(testString)
@@ -99,6 +44,43 @@ func TestSplit(t *testing.T) {
t.Errorf("Split(%q)[%v] -> %v. Want: %v", testString, i, got[i], want[i])
}
}
for _, x := range []string{
`abc\`, `\`, `'abc`, `'`, `"`, `asd\`,
} {
_, err := Split(x)
if err == nil {
t.Fatalf("Failed to get an error for: %#v", x)
}
}
s := func(q string) (ans []Tok) {
l := NewLexer(q)
for {
w := l.Next()
if w.Err != nil {
t.Fatal(w.Err)
}
if w.Value == "" {
break
}
ans = append(ans, Tok{w.Pos, w.Value})
}
return
}
for q, expected := range map[string][]Tok{
`"ab"`: {{0, "ab"}},
`x "ab"y \m`: {{0, `x`}, {2, `aby`}, {8, `m`}},
`x'y"\z'1`: {{0, `xy"\z1`}},
`\abc\ d`: {{0, `abc d`}},
``: nil,
` `: nil,
" \tabc\n\t\r ": {{2, "abc"}},
} {
if diff := cmp.Diff(expected, s(q)); diff != "" {
t.Fatalf("Failed for string: %#v\n%s", q, diff)
}
}
}
func TestSplitForCompletion(t *testing.T) {
@@ -108,7 +90,7 @@ func TestSplitForCompletion(t *testing.T) {
t.Fatalf("Failed to split: %s\n%s", cmdline, diff)
}
if last_arg_pos != actual_pos {
t.Fatalf("Failed to split: %s\n Last arg pos: %d != %d", cmdline, last_arg_pos, actual_pos)
t.Fatalf("Failed to split: %#v\n Last arg pos: %d != %d", cmdline, last_arg_pos, actual_pos)
}
}
test("a b", 2, "a", "b")