1
1
mirror of https://github.com/wader/fq.git synced 2024-11-23 18:56:52 +03:00
fq/pkg/interp/match.go
Mattias Wadman f7c7801bbe interp: Rework buffer regex support
Now all regexp functions should work similarly to the string versions for buffers.
Added scan_toend, which works like scan but outputs a buffer from the match to the end of the buffer;
useful when trying to decode a format and you don't know, or don't care to limit, where it ends.
2021-12-17 16:23:44 +01:00

166 lines
3.3 KiB
Go

package interp
import (
"io"
"regexp"
"strings"
"github.com/wader/fq/internal/gojqextra"
"github.com/wader/fq/internal/ioextra"
"github.com/wader/fq/pkg/ranges"
"github.com/wader/gojq"
)
func init() {
functionRegisterFns = append(functionRegisterFns, func(i *Interp) []Function {
return []Function{
{"_match_buffer", 1, 2, nil, i._bufferMatch},
}
})
}
// _bufferMatch implements the "_match_buffer" function: it runs a regular
// expression over the input buffer and returns an iterator of match objects.
//
// c is the input value and is converted with toBuffer. a[0] is the pattern:
// a string is compiled as a regexp as-is, any other value is converted with
// toBytes and matched literally, byte by byte (this also turns on byte mode).
// a[1], when present, must be a string of flags. This function itself looks
// at "g" (emit every match instead of only the first) and "b" (byte mode);
// the flags are also handed to gojqextra.CompileRegexp.
//
// Every emitted value is a map with the keys "offset", "length", "string"
// and "captures". "captures" holds one map per capture group with the same
// keys plus "name" (nil for unnamed groups). Offsets and lengths are used as
// byte counts relative to the start of the input buffer and "string" is a
// Buffer covering the matched range. A capture group that did not take part
// in the match gets offset -1, length 0 and a nil "string".
//
// Conversion and compile errors are returned as a single-element iterator
// holding the error.
func (i *Interp) _bufferMatch(c interface{}, a []interface{}) gojq.Iter {
	var ok bool

	bv, err := toBuffer(c)
	if err != nil {
		return gojq.NewIter(err)
	}

	var re string
	var byteRunes bool
	var global bool

	switch a0 := a[0].(type) {
	case string:
		re = a0
	default:
		reBuf, err := toBytes(a0)
		if err != nil {
			return gojq.NewIter(err)
		}
		// turn every byte into the code point with the same value so that the
		// pattern lines up with what the byte rune reader below produces
		reRs := make([]rune, 0, len(reBuf))
		for _, b := range reBuf {
			reRs = append(reRs, rune(b))
		}
		byteRunes = true
		// escape parentheses runes etc
		re = regexp.QuoteMeta(string(reRs))
	}

	var flags string
	if len(a) > 1 {
		flags, ok = a[1].(string)
		if !ok {
			// NOTE(review): the name "find" and type "string" look like
			// leftovers from an earlier version of this function; kept as-is
			// because the exact error text may be relied upon elsewhere.
			return gojq.NewIter(gojqextra.FuncTypeNameError{Name: "find", Typ: "string"})
		}
	}

	if strings.Contains(flags, "b") {
		byteRunes = true
	}
	global = strings.Contains(flags, "g")

	// TODO: err to string
	// TODO: extract to regexpextra? "all" FindReaderSubmatchIndex that can iter?
	sre, err := gojqextra.CompileRegexp(re, "gimb", flags)
	if err != nil {
		return gojq.NewIter(err)
	}
	sreNames := sre.SubexpNames()

	bb, err := bv.toBuffer()
	if err != nil {
		return gojq.NewIter(err)
	}

	var rr interface {
		io.RuneReader
		io.Seeker
	}
	// Raw bytes regexp matching is a bit tricky: what we do is read each byte
	// as a code point (ByteRuneReader) and then a UTF-8 encoded code point in
	// the pattern can be used to match a raw byte. So for example \u00ff
	// (encoded as 0xc3 0xbf) will match the byte 0xff.
	if byteRunes {
		// byte mode, read each byte as a rune
		rr = ioextra.ByteRuneReader{RS: bb}
	} else {
		rr = ioextra.RuneReadSeeker{RS: bb}
	}

	// off is where the next search starts, prevOff is where the previous one
	// started (-1 before the first search)
	var off int64
	prevOff := int64(-1)

	return iterFn(func() (interface{}, bool) {
		// TODO: correct way to handle empty match for buffer, move one byte forward?
		// > "asdasd" | [match(""; "g")], [(tobytes | match(""; "g"))] | length
		// 7
		// 1
		if prevOff == off {
			// previous match was empty, stop instead of matching it forever
			return nil, false
		}
		if prevOff != -1 && !global {
			// not global, only the first match is emitted
			return nil, false
		}

		if _, err := rr.Seek(off, io.SeekStart); err != nil {
			// NOTE(review): if iterFn follows the gojq.Iter convention a false
			// second value ends the iteration and this error is dropped;
			// confirm against iterFn before changing.
			return err, false
		}

		// indexes in l are relative to off, as the reader was just seeked there
		l := sre.FindReaderSubmatchIndex(rr)
		if l == nil {
			return nil, false
		}

		var captures []interface{}
		var firstCapture map[string]interface{}

		for ci := 0; ci < len(l)/2; ci++ {
			start, end := l[ci*2], l[ci*2+1]

			// defaults describe a group that did not participate in the match;
			// the offset must not be shifted by off in that case
			capture := map[string]interface{}{
				"offset": -1,
				"length": 0,
				"string": nil,
			}

			if start != -1 {
				capture["offset"] = int(off) + start
				capture["length"] = end - start

				// Buffer ranges are in bits, regexp indexes are used as bytes
				matchBitOff := (off + int64(start)) * 8
				matchLength := int64(end-start) * 8
				capture["string"] = Buffer{
					bb: bv.bb,
					r: ranges.Range{
						Start: bv.r.Start + matchBitOff,
						Len:   matchLength,
					},
					unit: 8,
				}
			}

			if ci > 0 {
				if sreNames[ci] != "" {
					capture["name"] = sreNames[ci]
				} else {
					capture["name"] = nil
				}
			}

			if ci == 0 {
				// index 0 is the whole match, it becomes the emitted object
				firstCapture = capture
			}

			captures = append(captures, capture)
		}

		prevOff = off
		// continue right after the end of the whole match
		off = off + int64(l[1])
		firstCapture["captures"] = captures[1:]

		return firstCapture, true
	})
}