package json_test

import (
	"bytes"
	"context"
	stdj "encoding/json"
	"io"
	"os"
	"path/filepath"
	"strings"
	"testing"

	"github.com/stretchr/testify/require"

	"github.com/neilotoole/sq/drivers/json"
	"github.com/neilotoole/sq/libsq/core/kind"
	"github.com/neilotoole/sq/testh"
	"github.com/neilotoole/sq/testh/sakila"
	"github.com/neilotoole/sq/testh/testsrc"
	"github.com/neilotoole/sq/testh/tu"
)
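
// BenchmarkIngestJSONL_Flat benchmarks ingestion of flat JSONL records,
// from both testdata files and inline input.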
func BenchmarkIngestJSONL_Flat(b *testing.B) {
	// $ go test -count=10 -benchtime=5s -bench BenchmarkIngestJSONL_Flat > old.bench.txt
	// # Make changes
	// $ go test -count=10 -benchtime=5s -bench BenchmarkIngestJSONL_Flat > new.bench.txt
	// $ benchstat old.bench.txt new.bench.txt

	// Either fpath (testdata file path) or input should be provided.
	testCases := []struct {
		name      string
		fpath     string
		input     string
		wantRows  int
		wantCols  []string
		wantKinds []kind.Kind
		wantErr   bool
	}{
		{
			name:      "actor",
			fpath:     "actor.jsonl",
			wantRows:  sakila.TblActorCount,
			wantCols:  sakila.TblActorCols(),
			wantKinds: sakila.TblActorColKinds(),
		},
		{
			name:      "film_actor",
			fpath:     "film_actor.jsonl",
			wantRows:  sakila.TblFilmActorCount,
			wantCols:  sakila.TblFilmActorCols(),
			wantKinds: sakila.TblFilmActorColKinds(),
		},
		{
			name:      "actor_nested",
			fpath:     "jsonl_actor_nested.jsonl",
			wantRows:  4,
			wantCols:  []string{"actor_id", "name_first_name", "name_last_name", "last_update"},
			wantKinds: []kind.Kind{kind.Int, kind.Text, kind.Text, kind.Datetime},
		},
		{
			name: "recs_medium",
			input: `{"a": 1, "b": 1, "c": true, "d": "2020-06-11", "e": 2.0}
{"a": 1.0, "b": 1, "c": false, "d": "2020-06-12", "e":2.01}`,
			wantRows:  2,
			wantCols:  []string{"a", "b", "c", "d", "e"},
			wantKinds: []kind.Kind{kind.Int, kind.Int, kind.Bool, kind.Date, kind.Float},
		},
		{
			name: "recs_small",
			input: `{"b": 1}
{"a": 1.1, "b": 2}`,
			wantRows:  2,
			wantCols:  []string{"b", "a"},
			wantKinds: []kind.Kind{kind.Int, kind.Float},
		},
		{
			name: "recs_null",
			input: `{"a": null, "b": null}
{"a": 1.1, "b": 2.0000}`,
			wantRows:  2,
			wantCols:  []string{"a", "b"},
			wantKinds: []kind.Kind{kind.Float, kind.Int},
		},
	}

	for _, tc := range testCases {
		tc := tc

		newRdrFn := func(_ context.Context) (io.ReadCloser, error) { //nolint:unparam
			return io.NopCloser(strings.NewReader(tc.input)), nil
		}

		if tc.fpath != "" {
			newRdrFn = func(_ context.Context) (io.ReadCloser, error) {
				return os.Open(filepath.Join("testdata", tc.fpath))
			}
		}

		b.Run(tc.name, func(b *testing.B) {
			b.ReportAllocs()
			b.ResetTimer()

			for i := 0; i < b.N; i++ {
				// Exclude per-iteration setup (test harness, source, grip) from timing.
				b.StopTimer()
				th := testh.New(b, testh.OptNoLog())
				src := th.Source(testsrc.EmptyDB)
				grip := th.Open(src)

				job := json.NewIngestJob(src, newRdrFn, grip, 0, true)

				b.StartTimer()
				err := json.IngestJSONL(th.Context, job)
				if tc.wantErr {
					require.Error(b, err)
					return
				}
			}
		})
	}
}
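
// TestIngestJSONL_Flat verifies ingestion of flat JSONL records, checking
// row count, column names, and column kinds.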
func TestIngestJSONL_Flat(t *testing.T) {
	t.Parallel()

	// Either fpath (testdata file path) or input should be provided.
	testCases := []struct {
		name      string
		fpath     string
		input     string
		wantRows  int
		wantCols  []string
		wantKinds []kind.Kind
		wantErr   bool
	}{
		{
			name:      "actor",
			fpath:     "actor.jsonl",
			wantRows:  sakila.TblActorCount,
			wantCols:  sakila.TblActorCols(),
			wantKinds: sakila.TblActorColKinds(),
		},
		{
			name:      "film_actor",
			fpath:     "film_actor.jsonl",
			wantRows:  sakila.TblFilmActorCount,
			wantCols:  sakila.TblFilmActorCols(),
			wantKinds: sakila.TblFilmActorColKinds(),
		},
		{
			name:      "actor_nested",
			fpath:     "jsonl_actor_nested.jsonl",
			wantRows:  4,
			wantCols:  []string{"actor_id", "name_first_name", "name_last_name", "last_update"},
			wantKinds: []kind.Kind{kind.Int, kind.Text, kind.Text, kind.Datetime},
		},
		{
			name: "recs_medium",
			input: `{"a": 1, "b": 1, "c": true, "d": "2020-06-11", "e": 2.0}
{"a": 1.0, "b": 1, "c": false, "d": "2020-06-12", "e":2.01}`,
			wantRows:  2,
			wantCols:  []string{"a", "b", "c", "d", "e"},
			wantKinds: []kind.Kind{kind.Int, kind.Int, kind.Bool, kind.Date, kind.Float},
		},
		{
			name: "recs_small",
			input: `{"b": 1}
{"a": 1.1, "b": 2}`,
			wantRows:  2,
			wantCols:  []string{"b", "a"},
			wantKinds: []kind.Kind{kind.Int, kind.Float},
		},
		{
			name: "recs_null",
			input: `{"a": null, "b": null}
{"a": 1.1, "b": 2.0000}`,
			wantRows:  2,
			wantCols:  []string{"a", "b"},
			wantKinds: []kind.Kind{kind.Float, kind.Int},
		},
	}

	for _, tc := range testCases {
		tc := tc

		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			newRdrFn := func(ctx context.Context) (io.ReadCloser, error) {
				return io.NopCloser(strings.NewReader(tc.input)), nil
			}

			if tc.fpath != "" {
				newRdrFn = func(ctx context.Context) (io.ReadCloser, error) {
					return os.Open(filepath.Join("testdata", tc.fpath))
				}
			}

			th, src, _, grip, _ := testh.NewWith(t, testsrc.EmptyDB)
			job := json.NewIngestJob(src, newRdrFn, grip, 0, true)

			err := json.IngestJSONL(th.Context, job)
			if tc.wantErr {
				require.Error(t, err)
				return
			}

			require.NoError(t, err)
			sink, err := th.QuerySQL(src, nil, "SELECT * FROM data")
			require.NoError(t, err)
			require.Equal(t, tc.wantRows, len(sink.Recs))
			require.Equal(t, tc.wantCols, sink.RecMeta.Names())
			require.Equal(t, tc.wantCols, sink.RecMeta.MungedNames())
			require.Equal(t, tc.wantKinds, sink.RecMeta.Kinds())
		})
	}
}
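
// TestIngestJSON_Flat verifies ingestion of a JSON array of flat objects
// from testdata/actor.json.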
func TestIngestJSON_Flat(t *testing.T) {
	t.Parallel()

	newRdrFn := func(context.Context) (io.ReadCloser, error) {
		return os.Open("testdata/actor.json")
	}

	th, src, _, grip, _ := testh.NewWith(t, testsrc.EmptyDB)
	job := json.NewIngestJob(src, newRdrFn, grip, 0, true)

	err := json.IngestJSON(th.Context, job)
	require.NoError(t, err)

	sink, err := th.QuerySQL(src, nil, "SELECT * FROM data")
	require.NoError(t, err)
	require.Equal(t, sakila.TblActorCount, len(sink.Recs))
}
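
// TestScanObjectsInArray exercises json.ScanObjectsInArray against a range of
// well-formed and malformed JSON array inputs, checking both the decoded
// objects and the raw chunk returned for each object.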
func TestScanObjectsInArray(t *testing.T) {
	t.Parallel()

	var (
		m1 = []map[string]any{{"a": float64(1)}}
		m2 = []map[string]any{{"a": float64(1)}, {"a": float64(2)}}
		m3 = []map[string]any{{"a": float64(1)}, {"a": float64(2)}, {"a": float64(3)}}
		m4 = []map[string]any{
			{
				"a": float64(1), "b": []any{float64(1), float64(2), float64(3)}, "c": map[string]any{"c1": float64(1)},
				"d": "d1",
			},
			{
				"a": float64(2), "b": []any{float64(21), float64(22), float64(23)}, "c": map[string]any{"c1": float64(2)},
				"d": "d2",
			},
		}
	)

	testCases := []struct {
		in         string
		wantObjs   []map[string]any
		wantChunks []string
		wantErr    bool
	}{
		{in: ``, wantErr: true},
		{in: `[,]`, wantErr: true},
		{in: `[],`, wantErr: true},
		{in: `[] ,`, wantErr: true},
		{in: `,[]`, wantErr: true},
		{in: `[]`},
		{in: ` []`},
		{in: ` [ ] `},
		{in: `[{"a":1}]`, wantObjs: m1, wantChunks: []string{`{"a":1}`}},
		{in: `{[{"a":1}]}`, wantErr: true},
		{in: `[,{"a":1}]`, wantErr: true},
		{in: `[{"a":1},]`, wantErr: true},
		{in: ` [{"a":1}]`, wantObjs: m1, wantChunks: []string{`{"a":1}`}},
		{in: `[ {"a":1} ]`, wantObjs: m1, wantChunks: []string{`{"a":1}`}},
		{in: `[ { "a" : 1 } ]`, wantObjs: m1, wantChunks: []string{`{ "a" : 1 }`}},
		{in: `[{"a":1},{"a":2}]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[,{"a":1},{"a":2}]`, wantErr: true},
		{in: `[{"a":1},,{"a":2}]`, wantErr: true},
		{in: ` [{"a":1},{"a":2}]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1}, {"a":2}]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1} ,{"a":2}]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1} , {"a":2}]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1} , {"a":2} ]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[ {"a":1} , {"a":2} ]`, wantObjs: m2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[ { "a" : 1} , {"a": 2 } ]`, wantObjs: m2, wantChunks: []string{`{ "a" : 1}`, `{"a": 2 }`}},
		{in: `[{"a":1},{"a":2},{"a":3}]`, wantObjs: m3, wantChunks: []string{`{"a":1}`, `{"a":2}`, `{"a":3}`}},
		{in: `[{"a":1} ,{"a":2},{"a":3}]`, wantObjs: m3, wantChunks: []string{`{"a":1}`, `{"a":2}`, `{"a":3}`}},
		{
			in: "[\n {\"a\" : 1},\n {\"a\" : 2 \n}\n,\n {\"a\": 3}\n]\n\n", wantObjs: m3,
			wantChunks: []string{"{\"a\" : 1}", "{\"a\" : 2 \n}", "{\"a\": 3}"},
		},
		{
			in: `[{"a":1,"b":[1,2,3],"c":{"c1":1},"d":"d1"} , {"a":2,"b":[21,22,23],"c":{"c1":2},"d":"d2"}]`,
			wantObjs: m4, wantChunks: []string{
				`{"a":1,"b":[1,2,3],"c":{"c1":1},"d":"d1"}`,
				`{"a":2,"b":[21,22,23],"c":{"c1":2},"d":"d2"}`,
			},
		},
	}

	for i, tc := range testCases {
		tc := tc

		t.Run(tu.Name(i, tc.in), func(t *testing.T) {
			t.Parallel()

			r := bytes.NewReader([]byte(tc.in))
			gotObjs, gotChunks, err := json.ScanObjectsInArray(r)
			if tc.wantErr {
				require.Error(t, err)
				return
			}

			require.NoError(t, err)
			require.EqualValues(t, tc.wantObjs, gotObjs)
			require.Equal(t, len(tc.wantObjs), len(gotChunks))
			require.Equal(t, len(tc.wantChunks), len(gotChunks))
			for j := range tc.wantChunks {
				require.Equal(t, tc.wantChunks[j], string(gotChunks[j]))
			}
		})
	}
}
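
// TestScanObjectsInArray_Files runs json.ScanObjectsInArray against the larger
// Sakila testdata files, checking the object and chunk counts.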
func TestScanObjectsInArray_Files(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		fname     string
		wantCount int
	}{
		{fname: "testdata/actor.json", wantCount: sakila.TblActorCount},
		{fname: "testdata/film_actor.json", wantCount: sakila.TblFilmActorCount},
		{fname: "testdata/payment.json", wantCount: sakila.TblPaymentCount},
	}

	for _, tc := range testCases {
		tc := tc

		t.Run(tu.Name(tc.fname), func(t *testing.T) {
			t.Parallel()

			f, err := os.Open(tc.fname)
			require.NoError(t, err)
			defer f.Close()

			gotObjs, gotChunks, err := json.ScanObjectsInArray(f)
			require.NoError(t, err)
			require.Equal(t, tc.wantCount, len(gotObjs))
			require.Equal(t, tc.wantCount, len(gotChunks))
		})
	}
}
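
// TestColumnOrderFlat verifies that json.ColumnOrderFlat returns flattened
// column names in document order, with nested field names joined by
// underscores (e.g. {"b": {"c": 2}} yields column "b_c").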
func TestColumnOrderFlat(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		in      string
		want    []string
		wantErr bool
	}{
		{in: `{}`, want: nil},
		{in: `{"a":1}`, want: []string{"a"}},
		{in: `{"a":1, "b": {"c":2}}`, want: []string{"a", "b_c"}},
		{in: `{"a":1, "b": {"c":2}, "d":3}`, want: []string{"a", "b_c", "d"}},
		{in: `{"a":1, "b": {"c":2, "d":3}}`, want: []string{"a", "b_c", "b_d"}},
		{in: `{"a":1, "b": {"c":2}, "d":3, "e":4}`, want: []string{"a", "b_c", "d", "e"}},
		{in: `{"a":1, "b": {"c":2}, "d": [3,4], "e":5}`, want: []string{"a", "b_c", "d", "e"}},
		{in: `{"d": [3,4], "e":5}`, want: []string{"d", "e"}},
		{in: `{"d": [3], "e":5}`, want: []string{"d", "e"}},
		{in: `{"d": [3,[4,5]], "e":6}`, want: []string{"d", "e"}},
		{in: `{"d": [3,[4,5,[6,7,8]]], "e":9, "fname":[10,11,[12,13]]}`, want: []string{"d", "e", "fname"}},
		{in: `{"a":1, "b": {"c":2}, "d": 3, "e":4}`, want: []string{"a", "b_c", "d", "e"}},
		{in: `{"b":1,"a":2}`, want: []string{"b", "a"}},
		{
			in:   `{"a":1,"b":2,"c":{"c1":3,"c2":4,"c3":{"d1":5,"d2":6},"c5":7},"e":8}`,
			want: []string{"a", "b", "c_c1", "c_c2", "c_c3_d1", "c_c3_d2", "c_c5", "e"},
		},
	}

	for i, tc := range testCases {
		tc := tc

		t.Run(tu.Name(i, tc.in), func(t *testing.T) {
			t.Parallel()

			require.True(t, stdj.Valid([]byte(tc.in)))

			gotCols, err := json.ColumnOrderFlat([]byte(tc.in))
			if tc.wantErr {
				require.Error(t, err)
				return
			}

			require.NoError(t, err)
			require.Equal(t, tc.want, gotCols)
		})
	}
}