sq/drivers/json/ingest_test.go
Neil O'Toole cf0908abfa
JSON ingest performance (#365)
* Implement json ingestion stmt caching
2024-01-25 09:21:56 -07:00

401 lines
11 KiB
Go

package json_test
import (
"bytes"
"context"
stdj "encoding/json"
"io"
"os"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/require"
"github.com/neilotoole/sq/drivers/json"
"github.com/neilotoole/sq/libsq/core/kind"
"github.com/neilotoole/sq/testh"
"github.com/neilotoole/sq/testh/sakila"
"github.com/neilotoole/sq/testh/testsrc"
"github.com/neilotoole/sq/testh/tu"
)
// BenchmarkIngestJSONL_Flat benchmarks json.IngestJSONL over a set of JSONL
// inputs. Each iteration builds a new test helper, source and ingest job
// (outside the timed region) and then times only the ingest call.
//
// To compare performance before and after a change:
//
//	$ go test -count=10 -benchtime=5s -bench BenchmarkIngestJSONL_Flat > old.bench.txt
//	# Make changes
//	$ go test -count=10 -benchtime=5s -bench BenchmarkIngestJSONL_Flat > new.bench.txt
//	$ benchstat old.bench.txt new.bench.txt
func BenchmarkIngestJSONL_Flat(b *testing.B) {
	// Either fpath (testdata file path) or input should be provided.
	testCases := []struct {
		name      string
		fpath     string
		input     string
		wantRows  int
		wantCols  []string
		wantKinds []kind.Kind
		wantErr   bool
	}{
		{
			name:      "actor",
			fpath:     "actor.jsonl",
			wantRows:  sakila.TblActorCount,
			wantCols:  sakila.TblActorCols(),
			wantKinds: sakila.TblActorColKinds(),
		},
		{
			name:      "film_actor",
			fpath:     "film_actor.jsonl",
			wantRows:  sakila.TblFilmActorCount,
			wantCols:  sakila.TblFilmActorCols(),
			wantKinds: sakila.TblFilmActorColKinds(),
		},
		{
			name:      "actor_nested",
			fpath:     "jsonl_actor_nested.jsonl",
			wantRows:  4,
			wantCols:  []string{"actor_id", "name_first_name", "name_last_name", "last_update"},
			wantKinds: []kind.Kind{kind.Int, kind.Text, kind.Text, kind.Datetime},
		},
		{
			name: "recs_medium",
			input: `{"a": 1, "b": 1, "c": true, "d": "2020-06-11", "e": 2.0}
{"a": 1.0, "b": 1, "c": false, "d": "2020-06-12", "e":2.01}`,
			wantRows:  2,
			wantCols:  []string{"a", "b", "c", "d", "e"},
			wantKinds: []kind.Kind{kind.Int, kind.Int, kind.Bool, kind.Date, kind.Float},
		},
		{
			name: "recs_small",
			input: `{"b": 1}
{"a": 1.1, "b": 2}`,
			wantRows:  2,
			wantCols:  []string{"b", "a"},
			wantKinds: []kind.Kind{kind.Int, kind.Float},
		},
		{
			name: "recs_null",
			input: `{"a": null, "b": null}
{"a": 1.1, "b": 2.0000}`,
			wantRows:  2,
			wantCols:  []string{"a", "b"},
			wantKinds: []kind.Kind{kind.Float, kind.Int},
		},
	}

	for _, tc := range testCases {
		tc := tc

		// Default to the inline input; a non-empty fpath overrides it with
		// a file under testdata.
		newRdrFn := func(_ context.Context) (io.ReadCloser, error) { //nolint:unparam
			return io.NopCloser(strings.NewReader(tc.input)), nil
		}
		if tc.fpath != "" {
			newRdrFn = func(_ context.Context) (io.ReadCloser, error) {
				return os.Open(filepath.Join("testdata", tc.fpath))
			}
		}

		b.Run(tc.name, func(b *testing.B) {
			b.ReportAllocs()
			b.ResetTimer()

			for i := 0; i < b.N; i++ {
				// Per-iteration setup is excluded from the measurement.
				b.StopTimer()
				th := testh.New(b, testh.OptNoLog())
				src := th.Source(testsrc.EmptyDB)
				grip := th.Open(src)
				job := json.NewIngestJob(src, newRdrFn, grip, 0, true)
				b.StartTimer()

				err := json.IngestJSONL(th.Context, job)
				if tc.wantErr {
					require.Error(b, err)
					return
				}

				// A failed ingest must fail the benchmark; otherwise the
				// reported timings would silently describe an error path.
				require.NoError(b, err)
			}
		})
	}
}
// TestIngestJSONL_Flat runs json.IngestJSONL over several JSONL inputs and
// then queries the resulting "data" table, checking the row count, the
// column names (plain and munged), and the column kinds.
func TestIngestJSONL_Flat(t *testing.T) {
	t.Parallel()

	// Each case supplies its data through exactly one of fpath (a file
	// under testdata) or input (inline JSONL text).
	testCases := []struct {
		name      string
		fpath     string
		input     string
		wantRows  int
		wantCols  []string
		wantKinds []kind.Kind
		wantErr   bool
	}{
		{
			name:      "actor",
			fpath:     "actor.jsonl",
			wantRows:  sakila.TblActorCount,
			wantCols:  sakila.TblActorCols(),
			wantKinds: sakila.TblActorColKinds(),
		},
		{
			name:      "film_actor",
			fpath:     "film_actor.jsonl",
			wantRows:  sakila.TblFilmActorCount,
			wantCols:  sakila.TblFilmActorCols(),
			wantKinds: sakila.TblFilmActorColKinds(),
		},
		{
			name:      "actor_nested",
			fpath:     "jsonl_actor_nested.jsonl",
			wantRows:  4,
			wantCols:  []string{"actor_id", "name_first_name", "name_last_name", "last_update"},
			wantKinds: []kind.Kind{kind.Int, kind.Text, kind.Text, kind.Datetime},
		},
		{
			name: "recs_medium",
			input: `{"a": 1, "b": 1, "c": true, "d": "2020-06-11", "e": 2.0}
{"a": 1.0, "b": 1, "c": false, "d": "2020-06-12", "e":2.01}`,
			wantRows:  2,
			wantCols:  []string{"a", "b", "c", "d", "e"},
			wantKinds: []kind.Kind{kind.Int, kind.Int, kind.Bool, kind.Date, kind.Float},
		},
		{
			name: "recs_small",
			input: `{"b": 1}
{"a": 1.1, "b": 2}`,
			wantRows:  2,
			wantCols:  []string{"b", "a"},
			wantKinds: []kind.Kind{kind.Int, kind.Float},
		},
		{
			name: "recs_null",
			input: `{"a": null, "b": null}
{"a": 1.1, "b": 2.0000}`,
			wantRows:  2,
			wantCols:  []string{"a", "b"},
			wantKinds: []kind.Kind{kind.Float, kind.Int},
		},
	}

	for _, tc := range testCases {
		tc := tc

		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			// Pick the data source: the testdata file when fpath is set,
			// the inline input otherwise.
			var newRdrFn func(ctx context.Context) (io.ReadCloser, error)
			if tc.fpath == "" {
				newRdrFn = func(context.Context) (io.ReadCloser, error) {
					return io.NopCloser(strings.NewReader(tc.input)), nil
				}
			} else {
				newRdrFn = func(context.Context) (io.ReadCloser, error) {
					return os.Open(filepath.Join("testdata", tc.fpath))
				}
			}

			th, src, _, grip, _ := testh.NewWith(t, testsrc.EmptyDB)
			job := json.NewIngestJob(src, newRdrFn, grip, 0, true)

			ingestErr := json.IngestJSONL(th.Context, job)
			if tc.wantErr {
				require.Error(t, ingestErr)
				return
			}
			require.NoError(t, ingestErr)

			sink, err := th.QuerySQL(src, nil, "SELECT * FROM data")
			require.NoError(t, err)

			meta := sink.RecMeta
			require.Equal(t, tc.wantRows, len(sink.Recs))
			require.Equal(t, tc.wantCols, meta.Names())
			require.Equal(t, tc.wantCols, meta.MungedNames())
			require.Equal(t, tc.wantKinds, meta.Kinds())
		})
	}
}
// TestIngestJSON_Flat ingests testdata/actor.json via json.IngestJSON and
// verifies that the resulting "data" table holds sakila.TblActorCount rows.
func TestIngestJSON_Flat(t *testing.T) {
	t.Parallel()

	th, src, _, grip, _ := testh.NewWith(t, testsrc.EmptyDB)

	openActorJSON := func(context.Context) (io.ReadCloser, error) {
		return os.Open("testdata/actor.json")
	}
	job := json.NewIngestJob(src, openActorJSON, grip, 0, true)

	ingestErr := json.IngestJSON(th.Context, job)
	require.NoError(t, ingestErr)

	sink, queryErr := th.QuerySQL(src, nil, "SELECT * FROM data")
	require.NoError(t, queryErr)
	require.Equal(t, sakila.TblActorCount, len(sink.Recs))
}
// TestScanObjectsInArray feeds json.ScanObjectsInArray a range of valid and
// malformed inputs. For valid input it checks both the decoded objects and
// the raw text chunk returned for each object; for malformed input it only
// checks that an error is returned.
func TestScanObjectsInArray(t *testing.T) {
	t.Parallel()

	// Expected decoded objects, shared by many cases below.
	var (
		objs1 = []map[string]any{{"a": float64(1)}}
		objs2 = []map[string]any{{"a": float64(1)}, {"a": float64(2)}}
		objs3 = []map[string]any{{"a": float64(1)}, {"a": float64(2)}, {"a": float64(3)}}

		objsNested = []map[string]any{
			{
				"a": float64(1), "b": []any{float64(1), float64(2), float64(3)}, "c": map[string]any{"c1": float64(1)},
				"d": "d1",
			},
			{
				"a": float64(2), "b": []any{float64(21), float64(22), float64(23)}, "c": map[string]any{"c1": float64(2)},
				"d": "d2",
			},
		}
	)

	testCases := []struct {
		in         string
		wantObjs   []map[string]any
		wantChunks []string
		wantErr    bool
	}{
		{in: ``, wantErr: true},
		{in: `[,]`, wantErr: true},
		{in: `[],`, wantErr: true},
		{in: `[] ,`, wantErr: true},
		{in: `,[]`, wantErr: true},
		{in: `[]`},
		{in: ` []`},
		{in: ` [ ] `},
		{in: `[{"a":1}]`, wantObjs: objs1, wantChunks: []string{`{"a":1}`}},
		{in: `{[{"a":1}]}`, wantErr: true},
		{in: `[,{"a":1}]`, wantErr: true},
		{in: `[{"a":1},]`, wantErr: true},
		{in: ` [{"a":1}]`, wantObjs: objs1, wantChunks: []string{`{"a":1}`}},
		{in: `[ {"a":1} ]`, wantObjs: objs1, wantChunks: []string{`{"a":1}`}},
		{in: `[ { "a" : 1 } ]`, wantObjs: objs1, wantChunks: []string{`{ "a" : 1 }`}},
		{in: `[{"a":1},{"a":2}]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[,{"a":1},{"a":2}]`, wantErr: true},
		{in: `[{"a":1},,{"a":2}]`, wantErr: true},
		{in: ` [{"a":1},{"a":2}]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1}, {"a":2}]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1} ,{"a":2}]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1} , {"a":2}]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[{"a":1} , {"a":2} ]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[ {"a":1} , {"a":2} ]`, wantObjs: objs2, wantChunks: []string{`{"a":1}`, `{"a":2}`}},
		{in: `[ { "a" : 1} , {"a": 2 } ]`, wantObjs: objs2, wantChunks: []string{`{ "a" : 1}`, `{"a": 2 }`}},
		{in: `[{"a":1},{"a":2},{"a":3}]`, wantObjs: objs3, wantChunks: []string{`{"a":1}`, `{"a":2}`, `{"a":3}`}},
		{in: `[{"a":1} ,{"a":2},{"a":3}]`, wantObjs: objs3, wantChunks: []string{`{"a":1}`, `{"a":2}`, `{"a":3}`}},
		{
			in: "[\n {\"a\" : 1},\n {\"a\" : 2 \n}\n,\n {\"a\": 3}\n]\n\n", wantObjs: objs3,
			wantChunks: []string{"{\"a\" : 1}", "{\"a\" : 2 \n}", "{\"a\": 3}"},
		},
		{
			in:       `[{"a":1,"b":[1,2,3],"c":{"c1":1},"d":"d1"} , {"a":2,"b":[21,22,23],"c":{"c1":2},"d":"d2"}]`,
			wantObjs: objsNested, wantChunks: []string{
				`{"a":1,"b":[1,2,3],"c":{"c1":1},"d":"d1"}`,
				`{"a":2,"b":[21,22,23],"c":{"c1":2},"d":"d2"}`,
			},
		},
	}

	for i, tc := range testCases {
		tc := tc

		t.Run(tu.Name(i, tc.in), func(t *testing.T) {
			t.Parallel()

			gotObjs, gotChunks, err := json.ScanObjectsInArray(bytes.NewReader([]byte(tc.in)))
			if tc.wantErr {
				require.Error(t, err)
				return
			}
			require.NoError(t, err)

			require.EqualValues(t, tc.wantObjs, gotObjs)

			// There must be one raw chunk per decoded object, and one per
			// expected chunk, before comparing them element by element.
			require.Equal(t, len(tc.wantObjs), len(gotChunks))
			require.Equal(t, len(tc.wantChunks), len(gotChunks))
			for j, wantChunk := range tc.wantChunks {
				require.Equal(t, wantChunk, string(gotChunks[j]))
			}
		})
	}
}
// TestScanObjectsInArray_Files runs json.ScanObjectsInArray over JSON files
// in testdata and checks that the number of decoded objects and the number
// of raw chunks both match the expected count for the file.
func TestScanObjectsInArray_Files(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		fname     string
		wantCount int
	}{
		{fname: "testdata/actor.json", wantCount: sakila.TblActorCount},
		{fname: "testdata/film_actor.json", wantCount: sakila.TblFilmActorCount},
		{fname: "testdata/payment.json", wantCount: sakila.TblPaymentCount},
	}

	for _, tc := range testCases {
		tc := tc

		t.Run(tu.Name(tc.fname), func(t *testing.T) {
			t.Parallel()

			file, openErr := os.Open(tc.fname)
			require.NoError(t, openErr)
			defer file.Close()

			objs, chunks, scanErr := json.ScanObjectsInArray(file)
			require.NoError(t, scanErr)
			require.Equal(t, tc.wantCount, len(objs))
			require.Equal(t, tc.wantCount, len(chunks))
		})
	}
}
// TestColumnOrderFlat checks the column names that json.ColumnOrderFlat
// returns for a JSON object. Each input is first confirmed to be valid JSON
// using the standard library, so a failure points at ColumnOrderFlat rather
// than at a malformed fixture.
func TestColumnOrderFlat(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		in      string
		want    []string
		wantErr bool
	}{
		{in: `{}`, want: nil},
		{in: `{"a":1}`, want: []string{"a"}},
		{in: `{"a":1, "b": {"c":2}}`, want: []string{"a", "b_c"}},
		{in: `{"a":1, "b": {"c":2}, "d":3}`, want: []string{"a", "b_c", "d"}},
		{in: `{"a":1, "b": {"c":2, "d":3}}`, want: []string{"a", "b_c", "b_d"}},
		{in: `{"a":1, "b": {"c":2}, "d":3, "e":4}`, want: []string{"a", "b_c", "d", "e"}},
		{in: `{"a":1, "b": {"c":2}, "d": [3,4], "e":5}`, want: []string{"a", "b_c", "d", "e"}},
		{in: `{"d": [3,4], "e":5}`, want: []string{"d", "e"}},
		{in: `{"d": [3], "e":5}`, want: []string{"d", "e"}},
		{in: `{"d": [3,[4,5]], "e":6}`, want: []string{"d", "e"}},
		{in: `{"d": [3,[4,5,[6,7,8]]], "e":9, "fname":[10,11,[12,13]]}`, want: []string{"d", "e", "fname"}},
		{in: `{"a":1, "b": {"c":2}, "d": 3, "e":4}`, want: []string{"a", "b_c", "d", "e"}},
		{in: `{"b":1,"a":2}`, want: []string{"b", "a"}},
		{
			in:   `{"a":1,"b":2,"c":{"c1":3,"c2":4,"c3":{"d1":5,"d2":6},"c5":7},"e":8}`,
			want: []string{"a", "b", "c_c1", "c_c2", "c_c3_d1", "c_c3_d2", "c_c5", "e"},
		},
	}

	for idx, tc := range testCases {
		tc := tc

		t.Run(tu.Name(idx, tc.in), func(t *testing.T) {
			t.Parallel()

			// Sanity-check the fixture itself before exercising the code
			// under test.
			isValid := stdj.Valid([]byte(tc.in))
			require.True(t, isValid)

			cols, err := json.ColumnOrderFlat([]byte(tc.in))
			if tc.wantErr {
				require.Error(t, err)
				return
			}

			require.NoError(t, err)
			require.Equal(t, tc.want, cols)
		})
	}
}