sq/libsq/driver/ingest.go

package driver

import (
	"context"

	"github.com/neilotoole/sq/libsq/core/errz"
	"github.com/neilotoole/sq/libsq/core/options"
	"github.com/neilotoole/sq/libsq/core/stringz"
)

// OptIngestHeader specifies whether ingested data has a header row or not.
// If not set, the ingester *may* try to detect if the input has a header.
var OptIngestHeader = options.NewBool(
	"ingest.header",
	"",
	0,
	false,
	"Ingest data has a header row",
	`Specifies whether ingested data has a header row or not.
If not set, the ingester *may* try to detect if the input has a header.
Generally it is best to leave this option unset and allow the ingester
to detect the header.`,
	options.TagSource,
)

// OptIngestSampleSize specifies the number of samples that a detector
// should take to determine type.
var OptIngestSampleSize = options.NewInt(
	"ingest.sample-size",
	"",
	0,
	1024,
	"Ingest data sample size for type detection",
	`Specify the number of samples that a detector should take to determine type.`,
	options.TagSource,
)

// OptIngestColRename transforms a column name in ingested data.
var OptIngestColRename = options.NewString(
	"ingest.column.rename",
	"",
	0,
	"{{.Name}}{{with .Recurrence}}_{{.}}{{end}}",
	func(s string) error {
		return stringz.ValidTemplate("ingest.column.rename", s)
	},
	"Template to rename ingest columns",
	`This Go text template is executed on ingested column names.
Its primary purpose is to rename duplicate header column names in the
ingested data. For example, given a CSV file with header row:

  actor_id, first_name, actor_id

The default template renames the columns to:

  actor_id, first_name, actor_id_1

The fields available in the template are:

  .Name         column header name
  .Index        zero-based index of the column in the header row
  .Alpha        alphabetical index of the column, e.g. [A, B ... Z, AA, AB]
  .Recurrence   nth recurrence of the colum name in the header row

For a unique column name, e.g. "first_name" above, ".Recurrence" will be 0.
For duplicate column names, ".Recurrence" will be 0 for the first instance,
then 1 for the next instance, and so on.`,
	options.TagSource,
)

// MungeIngestColNames transforms ingest data column names, per the template
// defined in the option driver.OptIngestColRename found on the context.
// It is the ingest counterpart of MungeResultColNames.
//
// For example, given a CSV file with header [actor_id, first_name, actor_id],
// the columns might be renamed to [actor_id, first_name, actor_id_1].
//
// MungeIngestColNames should be invoked by each ingester impl that may
// encounter duplicate col names in the ingest data.
func MungeIngestColNames(ctx context.Context, ogColNames []string) (colNames []string, err error) {
	if len(ogColNames) == 0 {
		return ogColNames, nil
	}

	o := options.FromContext(ctx)
	tplText := OptIngestColRename.Get(o)
	if tplText == "" {
		return ogColNames, nil
	}

	tpl, err := stringz.NewTemplate(OptIngestColRename.Key(), tplText)
	if err != nil {
		return nil, errz.Wrap(err, "config: ")
	}

	return doMungeColNames(tpl, ogColNames)
}