// File: sq/libsq/driver/ingest.go (98 lines, 3.0 KiB, Go).

package driver
import (
"context"
"github.com/neilotoole/sq/libsq/core/errz"
"github.com/neilotoole/sq/libsq/core/options"
"github.com/neilotoole/sq/libsq/core/stringz"
)
// OptIngestHeader is the boolean option, keyed "ingest.header", that
// specifies whether ingested data has a header row or not.
// If not set, the ingester *may* try to detect if the input has a header.
// Per the option's own help text, it is generally best to leave the option
// unset and let the ingester perform that detection.
//
// The option is tagged with options.TagSource.
var OptIngestHeader = options.NewBool(
	// Option key.
	"ingest.header",
	// NOTE(review): the next two positional args are presumably the flag name
	// and short flag, both left at their zero values here; confirm against
	// the options.NewBool signature.
	"",
	0,
	// NOTE(review): presumably the default value; confirm.
	false,
	// Short usage text, followed by the long help text.
	"Ingest data has a header row",
	`Specifies whether ingested data has a header row or not.
If not set, the ingester *may* try to detect if the input has a header.
Generally it is best to leave this option unset and allow the ingester
to detect the header.`,
	options.TagSource,
)
// OptIngestSampleSize is the integer option, keyed "ingest.sample-size",
// that specifies the number of samples that a detector should take to
// determine type.
//
// The option is tagged with options.TagSource.
var OptIngestSampleSize = options.NewInt(
	// Option key.
	"ingest.sample-size",
	// NOTE(review): the next two positional args are presumably the flag name
	// and short flag, both left at their zero values here; confirm against
	// the options.NewInt signature.
	"",
	0,
	// NOTE(review): presumably the default sample size; confirm.
	256,
	// Short usage text, followed by the long help text.
	"Ingest data sample size for type detection",
	`Specify the number of samples that a detector should take to determine type.`,
	options.TagSource,
)
// OptIngestColRename is the string option, keyed "ingest.column.rename",
// holding the Go text template used to transform column names in ingested
// data. MungeIngestColNames reads this option from the context.
//
// The template literal supplied below emits the column's .Name and, when
// .Recurrence is non-zero, appends "_" followed by the recurrence number
// (the {{with}} action skips its body for a zero value). The function
// argument validates a candidate value via stringz.ValidTemplate.
//
// The option is tagged with options.TagSource.
var OptIngestColRename = options.NewString(
	"ingest.column.rename",
	// NOTE(review): presumably the flag name and short flag, both left at
	// their zero values; confirm against the options.NewString signature.
	"",
	0,
	"{{.Name}}{{with .Recurrence}}_{{.}}{{end}}",
	func(s string) error {
		// The key is repeated as a literal on purpose: referring to
		// OptIngestColRename from inside its own initializer would be a
		// package initialization cycle.
		return stringz.ValidTemplate("ingest.column.rename", s)
	},
	"Template to rename ingest columns",
	`This Go text template is executed on ingested column names.
Its primary purpose is to rename duplicate header column names in the
ingested data. For example, given a CSV file with header row:
actor_id, first_name, actor_id
The default template renames the columns to:
actor_id, first_name, actor_id_1
The fields available in the template are:
.Name column header name
.Index zero-based index of the column in the header row
.Alpha alphabetical index of the column, e.g. [A, B ... Z, AA, AB]
.Recurrence nth recurrence of the column name in the header row
For a unique column name, e.g. "first_name" above, ".Recurrence" will be 0.
For duplicate column names, ".Recurrence" will be 0 for the first instance,
then 1 for the next instance, and so on.`,
	options.TagSource,
)
// MungeIngestColNames transforms ingest data column names, per the template
// defined in the option driver.OptIngestColRename found on the context.
// It is the ingest counterpart of MungeResultColNames.
//
// For example, given a CSV file with header [actor_id, first_name, actor_id],
// the columns might be renamed to [actor_id, first_name, actor_id_1].
//
// MungeIngestColNames should be invoked by each ingester impl that may
// encounter duplicate col names in the ingest data.
func MungeIngestColNames(ctx context.Context, ogColNames []string) (colNames []string, err error) {
if len(ogColNames) == 0 {
return ogColNames, nil
}
o := options.FromContext(ctx)
tplText := OptIngestColRename.Get(o)
if tplText == "" {
return ogColNames, nil
}
tpl, err := stringz.NewTemplate(OptIngestColRename.Key(), tplText)
if err != nil {
return nil, errz.Wrap(err, "config: ")
}
return doMungeColNames(tpl, ogColNames)
}