git-bug/query/lexer.go


package query

import (
	"fmt"
	"strings"
	"unicode"
)

type tokenKind int

const (
	_ tokenKind = iota
	tokenKindKV
	tokenKindKVV
	tokenKindSearch
)

type token struct {
	kind tokenKind

	// KV and KVV
	qualifier string
	value     string

	// KVV only
	subQualifier string

	// Search
	term string
}

func newTokenKV(qualifier, value string) token {
	return token{
		kind:      tokenKindKV,
		qualifier: qualifier,
		value:     value,
	}
}

func newTokenKVV(qualifier, subQualifier, value string) token {
	return token{
		kind:         tokenKindKVV,
		qualifier:    qualifier,
		subQualifier: subQualifier,
		value:        value,
	}
}

func newTokenSearch(term string) token {
	return token{
		kind: tokenKindSearch,
		term: term,
	}
}

// tokenize parses and breaks an input into tokens, ready to be
// interpreted later by a parser to extract the semantic.
func tokenize(query string) ([]token, error) {
	fields, err := splitFunc(query, unicode.IsSpace)
	if err != nil {
		return nil, err
	}

	var tokens []token
	for _, field := range fields {
		chunks, err := splitFunc(field, func(r rune) bool { return r == ':' })
		if err != nil {
			return nil, err
		}

		if strings.HasPrefix(field, ":") || strings.HasSuffix(field, ":") {
			return nil, fmt.Errorf("empty qualifier or value")
		}

		// pre-process chunks
		for i, chunk := range chunks {
			if len(chunk) == 0 {
				return nil, fmt.Errorf("empty qualifier or value")
			}
			chunks[i] = removeQuote(chunk)
		}

		switch len(chunks) {
		case 1: // full text search
			tokens = append(tokens, newTokenSearch(chunks[0]))
		case 2: // KV
			tokens = append(tokens, newTokenKV(chunks[0], chunks[1]))
		case 3: // KVV
			tokens = append(tokens, newTokenKVV(chunks[0], chunks[1], chunks[2]))
		default:
			return nil, fmt.Errorf("can't tokenize \"%s\": too many separators", field)
		}
	}
	return tokens, nil
}
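
// For illustration (a sketch of the expected behavior, given splitFunc and
// removeQuote below), a call such as
//
//	tokens, err := tokenize(`status:open author:"René Descartes" fulltext`)
//
// should produce, when err is nil, a KV token {qualifier: "status", value: "open"},
// a KV token {qualifier: "author", value: "René Descartes"} and a search token
// {term: "fulltext"}.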

// removeQuote strips a matching pair of surrounding quotes from the field, if any.
func removeQuote(field string) string {
	runes := []rune(field)
	if len(runes) >= 2 {
		r1 := runes[0]
		r2 := runes[len(runes)-1]
		if r1 == r2 && isQuote(r1) {
			return string(runes[1 : len(runes)-1])
		}
	}
	return field
}

// splitFunc splits the input into chunks according to separatorFunc, while
// respecting quotes.
func splitFunc(input string, separatorFunc func(r rune) bool) ([]string, error) {
	lastQuote := rune(0)
	inQuote := false

	// isChunk returns true if the rune is part of a chunk, or false if it
	// delimits one, as determined by separatorFunc.
	isChunk := func(r rune) bool {
		switch {
		case !inQuote && isQuote(r):
			lastQuote = r
			inQuote = true
			return true
		case inQuote && r == lastQuote:
			lastQuote = rune(0)
			inQuote = false
			return true
		case inQuote:
			return true
		default:
			return !separatorFunc(r)
		}
	}

	var result []string
	var chunk strings.Builder
	for _, r := range input {
		if isChunk(r) {
			chunk.WriteRune(r)
		} else {
			if chunk.Len() > 0 {
				result = append(result, chunk.String())
				chunk.Reset()
			}
		}
	}

	if inQuote {
		return nil, fmt.Errorf("unmatched quote")
	}

	if chunk.Len() > 0 {
		result = append(result, chunk.String())
	}

	return result, nil
}
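
// For illustration, splitting `label:"to do" open` with unicode.IsSpace as the
// separator should yield the chunks `label:"to do"` and `open` (quotes are kept
// here and only stripped later by removeQuote), while an input with an
// unterminated quote such as `label:"to do` should return an "unmatched quote"
// error.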

func isQuote(r rune) bool {
	return r == '"' || r == '\''
}