interp: Add buffer match support to find and grep

2024-12-23 13:22:58 +03:00 · 2021-10-17 01:26:30 +02:00 · 2021-10-17 01:26:30 +02:00 · 7298a4cd8d
commit 7298a4cd8d
parent 984ba1aa43
7 changed files with 77 additions and 22 deletions
--- a/doc/usage.md
+++ b/doc/usage.md
@ -142,11 +142,16 @@ notable is support for arbitrary-precision integers.
  - `format_root/0` return root value of format for value
  - `parent/0` return parent value
  - `parents/0` output parents of value
  - `find` and `grep` all take 1 or 2 arguments. First is a scalar to match, where a string is
  treated as a regexp. A buffer will be matched as exact bytes. Second argument is regexp
  flags with the addition of "b", which will treat each byte in the input buffer as a rune; this
  makes it possible to match exact bytes, e.g. `find("\u00ff"; "b")` will match the byte `0xff` and not
  the UTF-8 encoded codepoint `0xff`.
    - `find/1`, `find/2` match in buffer and output match buffers
    - `grep/1`, `grep/2` recursively match value and buffer
    - `vgrep/1`, `vgrep/2` recursively match value
    - `bgrep/1`, `bgrep/2` recursively match buffer
    - `fgrep/1`, `fgrep/2` recursively match field name
  - `find/1`, `find/2` match in buffer and output match buffers
 - `open` open file for reading
 - `probe` or `decode` probe format and decode
 - `mp3`, `matroska`, ..., `<name>`, `decode([name])` force decode as format
--- a/pkg/bitio/bitio.go
+++ b/pkg/bitio/bitio.go
@ -105,6 +105,7 @@ func Copy(dst BitWriter, src BitReader) (n int64, err error) {
 	return CopyBuffer(dst, src, nil)
 }
 // BitsByteCount returns smallest amount of bytes to fit nBits bits
 func BitsByteCount(nBits int64) int64 {
 	n := nBits / 8
 	if nBits%8 != 0 {
--- a/pkg/interp/funcs.go
+++ b/pkg/interp/funcs.go
@ -817,12 +817,24 @@ func (i *Interp) find(c interface{}, a []interface{}) gojq.Iter {
 	}
 	var re string
-	re, ok = a[0].(string)
+	var flags string
-	if !ok {
+
-		return gojq.NewIter(gojqextra.FuncTypeError{Name: "find", Typ: "string"})
+	switch a0 := a[0].(type) {
 	case string:
 		re = a0
 	default:
 		reBuf, err := toBytes(a0)
 		if err != nil {
 			return gojq.NewIter(err)
 		}
 		var reRs []rune
 		for _, b := range reBuf {
 			reRs = append(reRs, rune(b))
 		}
 		flags = "b"
 		re = string(reRs)
 	}
 	var flags string
 	if len(a) > 1 {
 		flags, ok = a[1].(string)
 		if !ok {
--- a/pkg/interp/grep.jq
+++ b/pkg/interp/grep.jq
@ -15,10 +15,12 @@ def _value_grep_string_cond($v; $flags):
    else false
    end
  )? // false;
 def _value_grep_other_cond($v; $flags):
  ( _tovalue
  | . == $v
  )? // false;
 def vgrep($v; $flags):
  _grep(
    $v;
@ -26,35 +28,40 @@ def vgrep($v; $flags):
    _value_grep_string_cond($v; $flags);
    _value_grep_other_cond($v; $flags)
  );
 def vgrep($v): vgrep($v; "");
-def _buf_grep_string_cond($v; $flags):
+def _buf_grep_any_cond($v; $flags):
  (isempty(find($v; $flags)) | not)? // false;
 def bgrep($v; $flags):
  _grep(
    $v;
    _is_scalar;
-    _buf_grep_string_cond($v; $flags);
+    _buf_grep_any_cond($v; $flags);
-    empty
+    _buf_grep_any_cond($v; $flags)
  );
 def bgrep($v): bgrep($v; "");
 def grep($v; $flags):
  _grep(
    $v;
    _is_scalar;
-    _buf_grep_string_cond($v; $flags) or _value_grep_string_cond($v; $flags);
+    _buf_grep_any_cond($v; $flags) or _value_grep_string_cond($v; $flags);
-    _value_grep_other_cond($v; $flags)
+    _buf_grep_any_cond($v; $flags) or _value_grep_other_cond($v; $flags)
  );
 def grep($v): grep($v; "");
 def _field_grep_string_cond($v; $flags):
-  (has("_name") and (._name | test($v; $flags)))? // false;
+  (._name | test($v; $flags))? // false;
 def fgrep($v; $flags):
  _grep(
    $v;
-    true;
+    _is_decode_value;
    _field_grep_string_cond($v; $flags);
    empty
  );
 def fgrep($v): fgrep($v; "");
--- a/pkg/interp/interp.go
+++ b/pkg/interp/interp.go
@ -267,6 +267,7 @@ func toBigInt(v interface{}) (*big.Int, error) {
 func toBytes(v interface{}) ([]byte, error) {
 	switch v := v.(type) {
 	// TODO: remove?
 	case []byte:
 		return v, nil
 	default:
@ -307,10 +308,15 @@ func toBufferEx(v interface{}, inArray bool) (*bitio.Buffer, error) {
 		}
 		if inArray {
-			b := [1]byte{byte(bi.Uint64())}
+			if bi.Cmp(big.NewInt(255)) > 0 || bi.Cmp(big.NewInt(0)) < 0 {
 				return nil, fmt.Errorf("buffer byte list must be bytes (0-255) got %v", bi)
 			}
 			n := bi.Uint64()
 			b := [1]byte{byte(n)}
 			return bitio.NewBufferFromBytes(b[:], -1), nil
 		}
 		// TODO: how should this work? "0xf | tobytes" 4bits or 8bits? now 4
 		padBefore := (8 - (bi.BitLen() % 8)) % 8
 		bb, err := bitio.NewBufferFromBytes(bi.Bytes(), -1).BitBufRange(int64(padBefore), int64(bi.BitLen()))
 		if err != nil {
@ -319,7 +325,7 @@ func toBufferEx(v interface{}, inArray bool) (*bitio.Buffer, error) {
 		return bb, nil
 	case []interface{}:
 		var rr []bitio.BitReadAtSeeker
-		// TODO: optimize byte array case
+		// TODO: optimize byte array case, flatten into one slice
 		for _, e := range vv {
 			eBB, eErr := toBufferEx(e, true)
 			if eErr != nil {
--- a/pkg/interp/testdata/buffer.fqtest
+++ b/pkg/interp/testdata/buffer.fqtest
@ -40,3 +40,18 @@ $ fq -d mp3 '.frames[0].padding | ("", "md5", "base64", "snippet") as $f | toval
 "ca9c491ac66b2c62500882e93f3719a8"
 "AAAAAAA="
 "<5>AAAAAAA="
 $ fq -d mp3 -i . /test.mp3
 mp3> [1, 2, 3] | tobytes
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|01 02 03|                                      |...|            |.: none 0x0-0x2.7 (3)
 mp3> [1, 2, 3, [1, 2, 3]] | tobytes
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|01 02 03 01 02 03|                             |......|         |.: none 0x0-0x5.7 (6)
 mp3> [1, 2, 3, [1, 2, 3], .headers[0].magic] | tobytes
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|01 02 03 01 02 03 49 44 33|                    |......ID3|      |.: none 0x0-0x8.7 (9)
 mp3> [-1] | tobytes
 error: buffer byte list must be bytes (0-255) got -1
 mp3> [256] | tobytes
 error: buffer byte list must be bytes (0-255) got 256
 mp3> ^D
--- a/pkg/interp/testdata/grep.fqtest
+++ b/pkg/interp/testdata/grep.fqtest
@ -1,5 +1,5 @@
 $ fq -i -d mp3 . /test.mp3
-mp3> grep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
+mp3> grep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff", [0x49, 0x44])
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x20|                                             40|               @|.frames[0].header.sample_rate: 44100
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
@ -14,7 +14,9 @@ mp3> grep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x40|      49 6e 66 6f                              |  Info          |.frames[0].xing.header: "Info"
-mp3> vgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
+   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
 mp3> vgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff", [0x49, 0x44])
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x20|                                             40|               @|.frames[0].header.sample_rate: 44100
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
@ -29,10 +31,10 @@ mp3> vgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x40|      49 6e 66 6f                              |  Info          |.frames[0].xing.header: "Info"
-mp3> fgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
+mp3> fgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff", [0x49, 0x44])
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
-mp3> bgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
+mp3> bgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff", [0x49, 0x44])
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
@ -41,6 +43,8 @@ mp3> bgrep(44100, "ID", "^ID3$", "^ID.?$", "Info", "magic", "\u00ff")
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
    |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x40|      49 6e 66 6f                              |  Info          |.frames[0].xing.header: "Info"
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|49 44 33                                       |ID3             |.headers[0].magic: "ID3" (Correct)
 mp3> "64ff65ff66" | hex | bgrep("\u00ff"; "b")
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|64 ff 65 ff 66|                                |d.e.f|          |.: none 0x0-0x4.7 (5)
@ -59,4 +63,9 @@ mp3> "aöaöa" | find("\u00c3"; "b")
 0x0|   c3 b6 61 c3 b6 61|                          | ..a..a|        |.: none 0x1-0x6.7 (6)
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|            c3 b6 61|                          |    ..a|        |.: none 0x4-0x6.7 (3)
 mp3> "aöaöa" | find([0xc3])
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|   c3 b6 61 c3 b6 61|                          | ..a..a|        |.: none 0x1-0x6.7 (6)
   |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
 0x0|            c3 b6 61|                          |    ..a|        |.: none 0x4-0x6.7 (3)
 mp3> ^D