2023-07-04 20:31:47 +03:00
|
|
|
package driver
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
|
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/options"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/stringz"
|
|
|
|
)
|
|
|
|
|
|
|
|
// OptIngestHeader specifies whether ingested data has a header row or not.
|
|
|
|
// If not set, the ingester *may* try to detect if the input has a header.
|
|
|
|
var OptIngestHeader = options.NewBool(
|
|
|
|
"ingest.header",
|
|
|
|
"",
|
2024-01-15 04:45:34 +03:00
|
|
|
false,
|
2023-07-04 20:31:47 +03:00
|
|
|
0,
|
|
|
|
false,
|
|
|
|
"Ingest data has a header row",
|
|
|
|
`Specifies whether ingested data has a header row or not.
|
|
|
|
If not set, the ingester *may* try to detect if the input has a header.
|
|
|
|
Generally it is best to leave this option unset and allow the ingester
|
|
|
|
to detect the header.`,
|
2023-07-09 04:34:53 +03:00
|
|
|
options.TagSource,
|
2024-01-15 04:45:34 +03:00
|
|
|
options.TagIngestMutate,
|
|
|
|
)
|
|
|
|
|
|
|
|
// OptIngestCache specifies whether ingested data is cached or not.
|
|
|
|
var OptIngestCache = options.NewBool(
|
|
|
|
"ingest.cache",
|
|
|
|
"no-cache",
|
|
|
|
true,
|
|
|
|
0,
|
|
|
|
true,
|
|
|
|
"Cache ingest data",
|
2024-01-29 00:55:51 +03:00
|
|
|
`Specifies whether ingested data is cached or not, on a default or per-source
|
|
|
|
basis. When data is ingested from a document source, it is stored in a cache DB.
|
|
|
|
Subsequent uses of that same source will use that cached DB instead of ingesting
|
|
|
|
the data again, unless this option is set to false, in which case, the data is
|
|
|
|
ingested each time.
|
|
|
|
|
|
|
|
# Set default ingest caching behavior
|
|
|
|
$ sq config set ingest.cache false
|
|
|
|
|
|
|
|
# Set ingest caching behavior for a specific source
|
|
|
|
$ sq config set --src @sakila ingest.cache false
|
|
|
|
`,
|
2024-01-15 04:45:34 +03:00
|
|
|
options.TagSource,
|
2023-07-04 20:31:47 +03:00
|
|
|
)
|
|
|
|
|
|
|
|
// OptIngestSampleSize specifies the number of samples that a detector
|
2024-01-15 04:45:34 +03:00
|
|
|
// should take to determine ingest data type.
|
2023-07-04 20:31:47 +03:00
|
|
|
var OptIngestSampleSize = options.NewInt(
|
|
|
|
"ingest.sample-size",
|
|
|
|
"",
|
|
|
|
0,
|
2023-08-16 18:09:50 +03:00
|
|
|
256,
|
2023-07-04 20:31:47 +03:00
|
|
|
"Ingest data sample size for type detection",
|
|
|
|
`Specify the number of samples that a detector should take to determine type.`,
|
2023-07-09 04:34:53 +03:00
|
|
|
options.TagSource,
|
2024-01-15 04:45:34 +03:00
|
|
|
options.TagIngestMutate,
|
2023-07-04 20:31:47 +03:00
|
|
|
)
|
|
|
|
|
2023-07-09 04:34:53 +03:00
|
|
|
// OptIngestColRename transforms a column name in ingested data.
|
2023-07-04 20:31:47 +03:00
|
|
|
var OptIngestColRename = options.NewString(
|
|
|
|
"ingest.column.rename",
|
|
|
|
"",
|
|
|
|
0,
|
|
|
|
"{{.Name}}{{with .Recurrence}}_{{.}}{{end}}",
|
|
|
|
func(s string) error {
|
|
|
|
return stringz.ValidTemplate("ingest.column.rename", s)
|
|
|
|
},
|
2023-07-09 04:34:53 +03:00
|
|
|
"Template to rename ingest columns",
|
2023-07-04 20:31:47 +03:00
|
|
|
`This Go text template is executed on ingested column names.
|
2023-07-09 04:34:53 +03:00
|
|
|
Its primary purpose is to rename duplicate header column names in the
|
|
|
|
ingested data. For example, given a CSV file with header row:
|
2023-07-04 20:31:47 +03:00
|
|
|
|
|
|
|
actor_id, first_name, actor_id
|
|
|
|
|
|
|
|
The default template renames the columns to:
|
|
|
|
|
|
|
|
actor_id, first_name, actor_id_1
|
|
|
|
|
|
|
|
The fields available in the template are:
|
|
|
|
|
|
|
|
.Name column header name
|
|
|
|
.Index zero-based index of the column in the header row
|
2023-07-09 04:34:53 +03:00
|
|
|
.Alpha alphabetical index of the column, e.g. [A, B ... Z, AA, AB]
|
2023-07-04 20:31:47 +03:00
|
|
|
.Recurrence nth recurrence of the colum name in the header row
|
|
|
|
|
|
|
|
For a unique column name, e.g. "first_name" above, ".Recurrence" will be 0.
|
|
|
|
For duplicate column names, ".Recurrence" will be 0 for the first instance,
|
|
|
|
then 1 for the next instance, and so on.`,
|
2023-07-09 04:34:53 +03:00
|
|
|
options.TagSource,
|
2024-01-15 04:45:34 +03:00
|
|
|
options.TagIngestMutate,
|
2023-07-04 20:31:47 +03:00
|
|
|
)
|
|
|
|
|
|
|
|
// MungeIngestColNames transforms ingest data column names, per the template
|
|
|
|
// defined in the option driver.OptIngestColRename found on the context.
|
|
|
|
// It is the ingest counterpart of MungeResultColNames.
|
|
|
|
//
|
|
|
|
// For example, given a CSV file with header [actor_id, first_name, actor_id],
|
|
|
|
// the columns might be renamed to [actor_id, first_name, actor_id_1].
|
|
|
|
//
|
|
|
|
// MungeIngestColNames should be invoked by each ingester impl that may
|
|
|
|
// encounter duplicate col names in the ingest data.
|
|
|
|
func MungeIngestColNames(ctx context.Context, ogColNames []string) (colNames []string, err error) {
|
|
|
|
if len(ogColNames) == 0 {
|
|
|
|
return ogColNames, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
o := options.FromContext(ctx)
|
|
|
|
tplText := OptIngestColRename.Get(o)
|
|
|
|
if tplText == "" {
|
|
|
|
return ogColNames, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
tpl, err := stringz.NewTemplate(OptIngestColRename.Key(), tplText)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Wrap(err, "config: ")
|
|
|
|
}
|
|
|
|
|
|
|
|
return doMungeColNames(tpl, ogColNames)
|
|
|
|
}
|