sq/libsq/driver/ingest.go
Neil O'Toole 181e128a2d
Release wrap up v0.47.0 ()
* Misc cleanup pre-release
2024-01-28 14:55:51 -07:00

125 lines
3.8 KiB
Go

package driver
import (
"context"
"github.com/neilotoole/sq/libsq/core/errz"
"github.com/neilotoole/sq/libsq/core/options"
"github.com/neilotoole/sq/libsq/core/stringz"
)
// OptIngestHeader specifies whether ingested data has a header row or not.
// If not set, the ingester *may* try to detect if the input has a header.
var OptIngestHeader = options.NewBool(
"ingest.header",
"",
false,
0,
false,
"Ingest data has a header row",
`Specifies whether ingested data has a header row or not.
If not set, the ingester *may* try to detect if the input has a header.
Generally it is best to leave this option unset and allow the ingester
to detect the header.`,
options.TagSource,
options.TagIngestMutate,
)
// OptIngestCache specifies whether ingested data is cached or not.
var OptIngestCache = options.NewBool(
"ingest.cache",
"no-cache",
true,
0,
true,
"Cache ingest data",
`Specifies whether ingested data is cached or not, on a default or per-source
basis. When data is ingested from a document source, it is stored in a cache DB.
Subsequent uses of that same source will use that cached DB instead of ingesting
the data again, unless this option is set to false, in which case, the data is
ingested each time.
# Set default ingest caching behavior
$ sq config set ingest.cache false
# Set ingest caching behavior for a specific source
$ sq config set --src @sakila ingest.cache false
`,
options.TagSource,
)
// OptIngestSampleSize specifies the number of samples that a detector
// should take to determine ingest data type.
var OptIngestSampleSize = options.NewInt(
"ingest.sample-size",
"",
0,
256,
"Ingest data sample size for type detection",
`Specify the number of samples that a detector should take to determine type.`,
options.TagSource,
options.TagIngestMutate,
)
// OptIngestColRename transforms a column name in ingested data.
var OptIngestColRename = options.NewString(
"ingest.column.rename",
"",
0,
"{{.Name}}{{with .Recurrence}}_{{.}}{{end}}",
func(s string) error {
return stringz.ValidTemplate("ingest.column.rename", s)
},
"Template to rename ingest columns",
`This Go text template is executed on ingested column names.
Its primary purpose is to rename duplicate header column names in the
ingested data. For example, given a CSV file with header row:
actor_id, first_name, actor_id
The default template renames the columns to:
actor_id, first_name, actor_id_1
The fields available in the template are:
.Name column header name
.Index zero-based index of the column in the header row
.Alpha alphabetical index of the column, e.g. [A, B ... Z, AA, AB]
.Recurrence nth recurrence of the colum name in the header row
For a unique column name, e.g. "first_name" above, ".Recurrence" will be 0.
For duplicate column names, ".Recurrence" will be 0 for the first instance,
then 1 for the next instance, and so on.`,
options.TagSource,
options.TagIngestMutate,
)
// MungeIngestColNames transforms ingest data column names, per the template
// defined in the option driver.OptIngestColRename found on the context.
// It is the ingest counterpart of MungeResultColNames.
//
// For example, given a CSV file with header [actor_id, first_name, actor_id],
// the columns might be renamed to [actor_id, first_name, actor_id_1].
//
// MungeIngestColNames should be invoked by each ingester impl that may
// encounter duplicate col names in the ingest data.
func MungeIngestColNames(ctx context.Context, ogColNames []string) (colNames []string, err error) {
if len(ogColNames) == 0 {
return ogColNames, nil
}
o := options.FromContext(ctx)
tplText := OptIngestColRename.Get(o)
if tplText == "" {
return ogColNames, nil
}
tpl, err := stringz.NewTemplate(OptIngestColRename.Key(), tplText)
if err != nil {
return nil, errz.Wrap(err, "config: ")
}
return doMungeColNames(tpl, ogColNames)
}