2020-10-20 18:05:43 +03:00
|
|
|
package json
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"context"
|
|
|
|
stdj "encoding/json"
|
|
|
|
"io"
|
|
|
|
|
2023-11-20 04:06:36 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/lg"
|
2023-04-02 22:49:45 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/core/lg/lga"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/lg/lgm"
|
2020-10-20 18:05:43 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/source"
|
2023-11-21 00:42:38 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/source/drivertype"
|
2020-10-20 18:05:43 +03:00
|
|
|
)
|
|
|
|
|
2023-05-03 15:36:10 +03:00
|
|
|
// DetectJSONL returns a source.DriverDetectFunc that can
|
|
|
|
// detect JSONL.
|
|
|
|
func DetectJSONL(sampleSize int) source.DriverDetectFunc {
|
2023-11-21 00:42:38 +03:00
|
|
|
return func(ctx context.Context, openFn source.FileOpenFunc) (detected drivertype.Type,
|
2023-05-03 15:36:10 +03:00
|
|
|
score float32, err error,
|
|
|
|
) {
|
|
|
|
log := lg.FromContext(ctx)
|
|
|
|
var r io.ReadCloser
|
2024-01-15 04:45:34 +03:00
|
|
|
r, err = openFn(ctx)
|
2023-05-03 15:36:10 +03:00
|
|
|
if err != nil {
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, errz.Err(err)
|
2020-10-20 18:05:43 +03:00
|
|
|
}
|
2023-05-03 15:36:10 +03:00
|
|
|
defer lg.WarnIfCloseError(log, lgm.CloseFileReader, r)
|
|
|
|
|
|
|
|
sc := bufio.NewScanner(r)
|
|
|
|
var validLines int
|
|
|
|
var line []byte
|
|
|
|
|
|
|
|
for sc.Scan() {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, ctx.Err()
|
2023-05-03 15:36:10 +03:00
|
|
|
default:
|
|
|
|
}
|
|
|
|
|
|
|
|
if err = sc.Err(); err != nil {
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, errz.Err(err)
|
2023-05-03 15:36:10 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
line = sc.Bytes()
|
|
|
|
line = bytes.TrimSpace(line)
|
|
|
|
if len(line) == 0 {
|
|
|
|
// Probably want to skip blank lines? Maybe
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Each line of JSONL must be braced
|
|
|
|
if line[0] != '{' || line[len(line)-1] != '}' {
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, nil
|
2023-05-03 15:36:10 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// If the line is JSONL, it should marshall into map[string]any
|
|
|
|
var vals map[string]any
|
|
|
|
err = stdj.Unmarshal(line, &vals)
|
|
|
|
if err != nil {
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, nil
|
2023-05-03 15:36:10 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
validLines++
|
|
|
|
if validLines >= sampleSize {
|
|
|
|
break
|
|
|
|
}
|
2020-10-20 18:05:43 +03:00
|
|
|
}
|
|
|
|
|
2023-05-03 15:36:10 +03:00
|
|
|
if err = sc.Err(); err != nil {
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, errz.Err(err)
|
2020-10-20 18:05:43 +03:00
|
|
|
}
|
|
|
|
|
2023-05-03 15:36:10 +03:00
|
|
|
if validLines > 0 {
|
|
|
|
return TypeJSONL, 1.0, nil
|
2020-10-20 18:05:43 +03:00
|
|
|
}
|
|
|
|
|
2023-11-21 00:42:38 +03:00
|
|
|
return drivertype.None, 0, nil
|
2020-10-20 18:05:43 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-03 15:36:10 +03:00
|
|
|
// DetectJSONL implements source.DriverDetectFunc.
|
|
|
|
|
2024-01-15 04:45:34 +03:00
|
|
|
func ingestJSONL(ctx context.Context, job ingestJob) error { //nolint:gocognit
|
2023-05-03 15:36:10 +03:00
|
|
|
log := lg.FromContext(ctx)
|
2023-04-02 22:49:45 +03:00
|
|
|
|
2024-01-15 04:45:34 +03:00
|
|
|
r, err := job.openFn(ctx)
|
2020-10-20 18:05:43 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2023-04-02 22:49:45 +03:00
|
|
|
defer lg.WarnIfCloseError(log, lgm.CloseFileReader, r)
|
2020-10-20 18:05:43 +03:00
|
|
|
|
2024-01-15 04:45:34 +03:00
|
|
|
drvr := job.destGrip.SQLDriver()
|
|
|
|
db, err := job.destGrip.DB(ctx)
|
2023-07-08 18:21:27 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
conn, err := db.Conn(ctx)
|
2020-10-20 18:05:43 +03:00
|
|
|
if err != nil {
|
|
|
|
return errz.Err(err)
|
|
|
|
}
|
2023-07-08 18:21:27 +03:00
|
|
|
defer lg.WarnIfCloseError(log, lgm.CloseDB, conn)
|
2020-10-20 18:05:43 +03:00
|
|
|
|
|
|
|
proc := newProcessor(job.flatten)
|
|
|
|
scan := newLineScanner(ctx, r, '{')
|
|
|
|
|
|
|
|
var (
|
|
|
|
hasMore bool
|
|
|
|
schemaModified bool
|
|
|
|
line []byte
|
|
|
|
curSchema *importSchema
|
|
|
|
insertions []*insertion
|
|
|
|
)
|
|
|
|
|
|
|
|
for {
|
|
|
|
hasMore, line, err = scan.next()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if schemaModified {
|
|
|
|
if !hasMore || scan.validLineCount >= job.sampleSize {
|
2023-04-02 22:49:45 +03:00
|
|
|
log.Debug("Time to (re)build the schema", lga.Line, scan.totalLineCount)
|
2020-10-20 18:05:43 +03:00
|
|
|
if curSchema == nil {
|
|
|
|
log.Debug("First time building the schema")
|
|
|
|
}
|
|
|
|
|
|
|
|
var newSchema *importSchema
|
|
|
|
newSchema, err = proc.buildSchemaFlat()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-07-08 18:21:27 +03:00
|
|
|
err = execSchemaDelta(ctx, drvr, conn, curSchema, newSchema)
|
2020-10-20 18:05:43 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// The DB has been updated with the current schema,
|
|
|
|
// so we mark it as clean.
|
|
|
|
proc.markSchemaClean()
|
|
|
|
|
|
|
|
curSchema = newSchema
|
|
|
|
newSchema = nil
|
|
|
|
|
|
|
|
insertions, err = proc.buildInsertionsFlat(curSchema)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-07-08 18:21:27 +03:00
|
|
|
err = execInsertions(ctx, drvr, conn, insertions)
|
2020-10-20 18:05:43 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !hasMore {
|
|
|
|
// We're done
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-17 02:34:33 +03:00
|
|
|
var m map[string]any
|
2020-10-20 18:05:43 +03:00
|
|
|
dec := stdj.NewDecoder(bytes.NewReader(line))
|
|
|
|
|
|
|
|
err = dec.Decode(&m)
|
|
|
|
if err != nil {
|
2022-12-18 07:31:06 +03:00
|
|
|
if err == io.EOF { //nolint:errorlint
|
2020-10-20 18:05:43 +03:00
|
|
|
break
|
|
|
|
}
|
|
|
|
return errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
schemaModified, err = proc.processObject(m, line)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Initial schema has not been created: we're still in
|
|
|
|
// the sampling phase. So we loop.
|
|
|
|
if curSchema == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we got this far, the initial schema has already been created.
|
|
|
|
if schemaModified {
|
|
|
|
// But... the schema has been modified. We could still be in
|
|
|
|
// the sampling phase, so we loop.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// The schema exists in the DB, and the current JSON chunk hasn't
|
|
|
|
// dirtied the schema, so it's safe to insert the recent rows.
|
|
|
|
insertions, err = proc.buildInsertionsFlat(curSchema)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-07-08 18:21:27 +03:00
|
|
|
err = execInsertions(ctx, drvr, conn, insertions)
|
2020-10-20 18:05:43 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if scan.validLineCount == 0 {
|
|
|
|
return errz.New("empty JSONL input")
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// lineScanner reads JSON input one line at a time, skipping empty
// lines and counting both the lines read and the lines returned.
// Because empty lines are skipped, totalLineCount may exceed
// validLineCount. A non-empty line that does not begin with
// requireAnchor results in an error.
type lineScanner struct {
	// ctx is polled for cancellation before each line is read.
	ctx context.Context

	// sc is the underlying line scanner.
	sc *bufio.Scanner

	// requireAnchor is the byte each non-empty line must start with.
	requireAnchor byte

	// totalLineCount is the number of lines read, including skipped ones.
	totalLineCount int

	// validLineCount is the number of lines returned to the caller.
	validLineCount int
}
|
|
|
|
|
|
|
|
func newLineScanner(ctx context.Context, r io.Reader, requireAnchor byte) *lineScanner {
|
|
|
|
return &lineScanner{ctx: ctx, sc: bufio.NewScanner(r), requireAnchor: requireAnchor}
|
|
|
|
}
|
|
|
|
|
|
|
|
// next returns the next non-empty line.
|
|
|
|
func (ls *lineScanner) next() (hasMore bool, line []byte, err error) {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ls.ctx.Done():
|
|
|
|
return false, nil, ls.ctx.Err()
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
|
|
|
|
hasMore = ls.sc.Scan()
|
|
|
|
if !hasMore {
|
|
|
|
return false, nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if err = ls.sc.Err(); err != nil {
|
|
|
|
return false, nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
line = ls.sc.Bytes()
|
|
|
|
ls.totalLineCount++
|
|
|
|
if len(line) == 0 {
|
|
|
|
// Probably want to skip blank lines? Maybe
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if line[0] != ls.requireAnchor {
|
|
|
|
return false, nil, errz.Errorf("line %d expected to begin with '%s' but got '%s'",
|
|
|
|
ls.totalLineCount-1, string(ls.requireAnchor), string(line[0]))
|
|
|
|
}
|
|
|
|
|
|
|
|
ls.validLineCount++
|
|
|
|
return true, line, nil
|
|
|
|
}
|
|
|
|
}
|