git-bug/query/lexer.go

package query

import (
	"fmt"
	"strings"
	"unicode"
)

// tokenKind identifies which flavour of token the lexer produced.
type tokenKind int

// The kinds start at 1 so that the zero value of tokenKind never matches a
// valid kind.
const (
	// tokenKindKV is a "qualifier:value" token.
	tokenKindKV tokenKind = iota + 1
	// tokenKindKVV is a "qualifier:subQualifier:value" token.
	tokenKindKVV
	// tokenKindSearch is a free-text search term.
	tokenKindSearch
)

// token is one lexical unit produced by tokenize. The kind field tells which
// of the other fields are populated; the constructors below leave the fields
// that do not apply to a kind at their zero value.
type token struct {
	// kind is one of the tokenKind* constants.
	kind tokenKind

	// KV and KVV: the part before the first ':' and the part after the
	// last ':'.
	qualifier string
	value     string

	// KVV only: the part between the two ':' separators.
	subQualifier string

	// Search only: the free-text term.
	term string
}

// newTokenKV builds a token of kind tokenKindKV carrying the given qualifier
// and value. The remaining fields are left at their zero value.
func newTokenKV(qualifier, value string) token {
	tok := token{kind: tokenKindKV}
	tok.qualifier, tok.value = qualifier, value
	return tok
}

// newTokenKVV builds a token of kind tokenKindKVV carrying the given
// qualifier, sub-qualifier and value. The term field is left empty.
func newTokenKVV(qualifier, subQualifier, value string) token {
	var tok token
	tok.kind = tokenKindKVV
	tok.qualifier = qualifier
	tok.subQualifier = subQualifier
	tok.value = value
	return tok
}

// newTokenSearch builds a token of kind tokenKindSearch holding a free-text
// search term. The qualifier, sub-qualifier and value fields are left empty.
func newTokenSearch(term string) token {
	tok := token{term: term}
	tok.kind = tokenKindSearch
	return tok
}

// tokenize parses a query and breaks it into tokens, ready to be interpreted
// later by a parser to extract the semantics.
//
// The query is first cut into whitespace-separated fields by splitQuery, then
// each field is split on its unquoted ':' separators:
//   - no separator: a search token holding the field as a free-text term
//   - one separator: a KV token (qualifier:value)
//   - two separators: a KVV token (qualifier:subQualifier:value)
//
// A pair of matching quotes surrounding a term, a sub-qualifier or a value is
// removed; qualifiers are kept verbatim. An error is returned for an unmatched
// quote, for more than two separators, and for an empty qualifier,
// sub-qualifier or value (including doubled separators such as "a::b").
func tokenize(query string) ([]token, error) {
	fields, err := splitQuery(query)
	if err != nil {
		return nil, err
	}

	var tokens []token
	for _, field := range fields {
		split, err := splitQualifiedField(field)
		if err != nil {
			return nil, err
		}

		// full text search
		if len(split) == 1 {
			tokens = append(tokens, newTokenSearch(removeQuote(field)))
			continue
		}

		if len(split) > 3 {
			return nil, fmt.Errorf("can't tokenize \"%s\": too many separators", field)
		}

		if len(split[0]) == 0 {
			return nil, fmt.Errorf("can't tokenize \"%s\": empty qualifier", field)
		}

		switch len(split) {
		case 2:
			if len(split[1]) == 0 {
				return nil, fmt.Errorf("empty value for qualifier \"%s\"", split[0])
			}

			tokens = append(tokens, newTokenKV(split[0], removeQuote(split[1])))
		default: // exactly 3 chunks
			if len(split[1]) == 0 {
				return nil, fmt.Errorf("empty sub-qualifier for qualifier \"%s\"", split[0])
			}

			if len(split[2]) == 0 {
				return nil, fmt.Errorf("empty value for qualifier \"%s:%s\"", split[0], split[1])
			}

			tokens = append(tokens, newTokenKVV(split[0], removeQuote(split[1]), removeQuote(split[2])))
		}
	}
	return tokens, nil
}

// splitQualifiedField splits a single field on ':' separators, ignoring any
// separator enclosed in single or double quotes. A quoted section is closed
// only by the same quote character that opened it, matching splitQuery.
//
// Empty chunks are preserved so the caller can report them: "a::b" yields
// ["a", "", "b"] and ":a" yields ["", "a"]. The result always holds at least
// one chunk. An error is returned if a quote is left open.
func splitQualifiedField(field string) ([]string, error) {
	var chunks []string
	var openQuote rune // 0 outside quotes, otherwise the rune that opened the section
	start := 0         // byte offset where the current chunk begins

	for i, r := range field {
		switch {
		case openQuote != 0:
			if r == openQuote {
				openQuote = 0
			}
		case isQuote(r):
			openQuote = r
		case r == ':':
			chunks = append(chunks, field[start:i])
			// ':' is a single byte, so the next chunk starts right after it.
			start = i + 1
		}
	}

	if openQuote != 0 {
		return nil, fmt.Errorf("can't tokenize \"%s\": unmatched quote", field)
	}

	return append(chunks, field[start:]), nil
}

// splitQuery cuts the query into chunks separated by whitespace. Whitespace
// found inside a single- or double-quoted section does not split, and a
// quoted section is only closed by the same quote character that opened it.
// Quote characters are kept in the returned chunks. An error is returned when
// a quoted section is still open at the end of the query.
func splitQuery(query string) ([]string, error) {
	var (
		chunks  []string
		current strings.Builder
		opener  rune // quote that opened the current section, 0 outside quotes
	)

	// flush stores the chunk being built, if any, and starts a new one.
	flush := func() {
		if current.Len() == 0 {
			return
		}
		chunks = append(chunks, current.String())
		current.Reset()
	}

	for _, r := range query {
		switch {
		case opener != 0:
			// Inside quotes everything is kept, the closing quote included.
			if r == opener {
				opener = 0
			}
			current.WriteRune(r)
		case isQuote(r):
			opener = r
			current.WriteRune(r)
		case unicode.IsSpace(r):
			flush()
		default:
			current.WriteRune(r)
		}
	}

	if opener != 0 {
		return nil, fmt.Errorf("unmatched quote")
	}

	flush()
	return chunks, nil
}

// isQuote reports whether r is a double quote (") or a single quote (').
func isQuote(r rune) bool {
	switch r {
	case '"', '\'':
		return true
	default:
		return false
	}
}

// removeQuote strips one pair of surrounding quotes from field when its first
// and last runes are the same quote character. Any other input, including a
// lone quote or mismatched quotes, is returned unchanged.
func removeQuote(field string) string {
	chars := []rune(field)
	last := len(chars) - 1

	// At least two runes are needed to form a pair of quotes.
	if last < 1 {
		return field
	}

	if first := chars[0]; !isQuote(first) || first != chars[last] {
		return field
	}

	return string(chars[1:last])
}
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`package query`

			`import (`
			`"fmt"`
			`"strings"`
			`"unicode"`
			`)`

query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00			`type tokenKind int`

			`const (`
			`_ tokenKind = iota`
			`tokenKindKV`
Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`tokenKindKVV`
query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00			`tokenKindSearch`
			`)`

cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`type token struct {`
query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00			`kind tokenKind`

Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`// KV and KVV`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`qualifier string`
			`value string`
query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00
Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`// KVV only`
			`subQualifier string`

query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00			`// Search`
			`term string`
			`}`

			`func newTokenKV(qualifier, value string) token {`
			`return token{`
			`kind: tokenKindKV,`
			`qualifier: qualifier,`
			`value: value,`
			`}`
			`}`

Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`func newTokenKVV(qualifier, subQualifier, value string) token {`
			`return token{`
			`kind: tokenKindKVV,`
			`qualifier: qualifier,`
			`subQualifier: subQualifier,`
			`value: value,`
			`}`
			`}`

query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00			`func newTokenSearch(term string) token {`
			`return token{`
			`kind: tokenKindSearch,`
			`term: term,`
			`}`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`}`

			`// tokenize parse and break a input into tokens ready to be`
			`// interpreted later by a parser to get the semantic.`
			`func tokenize(query string) ([]token, error) {`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`fields, err := splitQuery(query)`
			`if err != nil {`
			`return nil, err`
			`}`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00
			`var tokens []token`
			`for _, field := range fields {`
Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`// Split using ':' as separator, but separators inside '"' don't count.`
			`quoted := false`
			`split := strings.FieldsFunc(field, func(r rune) bool {`
			`if r == '"' {`
			`quoted = !quoted`
			`}`
			`return !quoted && r == ':'`
			`})`
			`if (strings.HasPrefix(field, ":")) {`
			`split = append([]string{""}, split...)`
			`}`
			`if (strings.HasSuffix(field, ":")) {`
			`split = append(split, "")`
			`}`
			`if (quoted) {`
			`return nil, fmt.Errorf("can't tokenize \"%s\": unmatched quote", field)`
			`}`
query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00
			`// full text search`
			`if len(split) == 1 {`
			`tokens = append(tokens, newTokenSearch(removeQuote(field)))`
			`continue`
			`}`

Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`if len(split) > 3 {`
			`return nil, fmt.Errorf("can't tokenize \"%s\": too many separators", field)`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`}`

			`if len(split[0]) == 0 {`
			`return nil, fmt.Errorf("can't tokenize \"%s\": empty qualifier", field)`
			`}`

Add ability to search by arbitrary metadata Example: ~/git/git-bug/git-bug ls --metadata github-url=https://github.com/author/myproject/issues/42 or ~/git/git-bug/git-bug ls metadata:github-url:\"https://github.com/author/myproject/issues/42\" Fixes the cmdline part of <https://github.com/MichaelMure/git-bug/issues/567>. 2021-02-14 18:03:51 +03:00			`if len(split) == 2 {`
			`if len(split[1]) == 0 {`
			`return nil, fmt.Errorf("empty value for qualifier \"%s\"", split[0])`
			`}`

			`tokens = append(tokens, newTokenKV(split[0], removeQuote(split[1])))`
			`} else {`
			`if len(split[1]) == 0 {`
			`return nil, fmt.Errorf("empty sub-qualifier for qualifier \"%s\"", split[0])`
			`}`

			`if len(split[2]) == 0 {`
			`return nil, fmt.Errorf("empty value for qualifier \"%s:%s\"", split[0], split[1])`
			`}`

			`tokens = append(tokens, newTokenKVV(split[0], removeQuote(split[1]), removeQuote(split[2])))`
			`}`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`}`
			`return tokens, nil`
			`}`

query: expand the tokenizer/parser to parse arbitrary search terms 2020-08-20 13:00:34 +03:00			`// split the query into chunks by splitting on whitespaces but respecting`
			`// quotes`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`func splitQuery(query string) ([]string, error) {`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`lastQuote := rune(0)`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`inQuote := false`

			`isToken := func(r rune) bool {`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`switch {`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`case !inQuote && isQuote(r):`
			`lastQuote = r`
			`inQuote = true`
			`return true`
			`case inQuote && r == lastQuote:`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`lastQuote = rune(0)`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`inQuote = false`
			`return true`
			`case inQuote:`
			`return true`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`default:`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`return !unicode.IsSpace(r)`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`}`
			`}`

query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`var result []string`
			`var token strings.Builder`
			`for _, r := range query {`
			`if isToken(r) {`
			`token.WriteRune(r)`
			`} else {`
			`if token.Len() > 0 {`
			`result = append(result, token.String())`
			`token.Reset()`
			`}`
			`}`
			`}`

			`if inQuote {`
			`return nil, fmt.Errorf("unmatched quote")`
			`}`

			`if token.Len() > 0 {`
			`result = append(result, token.String())`
			`}`

			`return result, nil`
			`}`

			`func isQuote(r rune) bool {`
			`return r == '"' \|\| r == '\''`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`}`

			`func removeQuote(field string) string {`
query: more robust tokenizer 2020-03-28 21:22:27 +03:00			`runes := []rune(field)`
			`if len(runes) >= 2 {`
			`r1 := runes[0]`
			`r2 := runes[len(runes)-1]`

			`if r1 == r2 && isQuote(r1) {`
			`return string(runes[1 : len(runes)-1])`
cache: replace the all-in-one query parser by a complete one with AST/lexer/parser 2020-03-14 18:47:38 +03:00			`}`
			`}`
			`return field`
			`}`