sq/drivers/csv/detect_header.go

package csv

import (
	"context"

	"github.com/neilotoole/sq/libsq/driver"

	"github.com/neilotoole/sq/libsq/core/lg"
	"github.com/neilotoole/sq/libsq/core/lg/lga"

	"github.com/neilotoole/sq/libsq/core/options"

	"github.com/neilotoole/sq/libsq/core/errz"
	"github.com/neilotoole/sq/libsq/core/kind"
)

// hasHeaderRow reports whether recs should be treated as having a
// header row. If the ingest header option is explicitly set in opts,
// that value is returned as-is and no detection is performed;
// otherwise the decision is delegated to detectHeaderRow.
func hasHeaderRow(ctx context.Context, recs [][]string, opts options.Options) (bool, error) {
	if !driver.OptIngestHeader.IsSet(opts) {
		// Nothing explicit in opts: inspect the data instead.
		return detectHeaderRow(recs)
	}

	explicit := driver.OptIngestHeader.Get(opts)
	log := lg.FromContext(ctx)
	log.Debug(
		"CSV ingest header explicitly specified: skipping header detection",
		lga.Key, driver.OptIngestHeader.Key(),
		lga.Val, explicit,
	)

	return explicit, nil
}

// detectHeaderRow reports whether the first row of recs looks like a
// header row. It computes one kind hash for the first row alone and
// another for all remaining rows; if the two hashes differ, the first
// row is deemed to be a header.
//
// With fewer than two records there is nothing to compare against, so
// the lone row (if any) is treated as data and false is returned.
//
// The recs arg should be regularly shaped, i.e. every rec should have
// the same number of fields.
func detectHeaderRow(recs [][]string) (hasHeader bool, err error) {
	if len(recs) < 2 {
		return false, nil
	}

	var headHash, bodyHash string

	if headHash, err = calcKindHash(recs[:1]); err != nil {
		return false, err
	}

	if bodyHash, err = calcKindHash(recs[1:]); err != nil {
		return false, err
	}

	return headHash != bodyHash, nil
}

// calcKindHash samples every field of recs, column by column, using
// one kind.Detector per column, and returns the result of kind.Hash
// for those detectors.
//
// The column count is taken from the first record. An error is
// returned if recs is empty, if the first record has no fields, or if
// any record has more fields than the first record. Records with fewer
// fields than the first record are tolerated: the missing columns are
// simply not sampled for that record.
func calcKindHash(recs [][]string) (string, error) {
	if len(recs) == 0 || len(recs[0]) == 0 {
		return "", errz.New("no records")
	}

	fieldCount := len(recs[0])

	detectors := make([]*kind.Detector, fieldCount)
	for i := range detectors {
		detectors[i] = kind.NewDetector()
	}

	for i, rec := range recs {
		// There is exactly one detector per column of the first record,
		// so a wider record would index past the end of detectors and
		// panic. Report it as an error instead.
		if len(rec) > fieldCount {
			return "", errz.Errorf("record %d has %d fields but expected at most %d",
				i, len(rec), fieldCount)
		}

		for j, val := range rec {
			detectors[j].Sample(val)
		}
	}

	return kind.Hash(detectors)
}
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`package csv`

			`import (`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`"context"`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`"github.com/neilotoole/sq/libsq/driver"`

#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`"github.com/neilotoole/sq/libsq/core/lg"`
			`"github.com/neilotoole/sq/libsq/core/lg/lga"`

Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`"github.com/neilotoole/sq/libsq/core/options"`

			`"github.com/neilotoole/sq/libsq/core/errz"`
			`"github.com/neilotoole/sq/libsq/core/kind"`
			`)`

			`// hasHeaderRow returns true if a header row is explicitly`
			`// set in opts, or if detectHeaderRow detects that the first`
			`// row of recs seems to be a header.`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`func hasHeaderRow(ctx context.Context, recs [][]string, opts options.Options) (bool, error) {`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`if driver.OptIngestHeader.IsSet(opts) {`
			`b := driver.OptIngestHeader.Get(opts)`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`lg.FromContext(ctx).Debug("CSV ingest header explicitly specified: skipping header detection",`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`lga.Key, driver.OptIngestHeader.Key(),`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`lga.Val, b)`
			`return b, nil`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`}`

			`return detectHeaderRow(recs)`
			`}`

			`// detectHeaderRow returns true if recs has a header row.`
			`// The recs arg should be regularly shaped: each rec should`
			`// have the same number of fields.`
			`func detectHeaderRow(recs [][]string) (hasHeader bool, err error) {`
			`if len(recs) < 2 {`
			`// If zero records, obviously no header row.`
			`// If one record... well, is there any way of determining if`
			`// it's a header row or not? Probably best to treat it as a data row.`
			`return false, nil`
			`}`

			`firstRowHash, err := calcKindHash(recs[0:1])`
			`if err != nil {`
			`return false, err`
			`}`

			`remainderHash, err := calcKindHash(recs[1:])`
			`if err != nil {`
			`return false, err`
			`}`

			`if firstRowHash != remainderHash {`
			`return true, nil`
			`}`

			`return false, nil`
			`}`

			`func calcKindHash(recs [][]string) (string, error) {`
			`if len(recs) == 0 \|\| len(recs[0]) == 0 {`
			`return "", errz.New("no records")`
			`}`

			`fieldCount := len(recs[0])`

			`detectors := make([]*kind.Detector, len(recs[0]))`
			`for i := 0; i < fieldCount; i++ {`
			`detectors[i] = kind.NewDetector()`
			`}`

			`for i := range recs {`
			`for j := range recs[i] {`
			`detectors[j].Sample(recs[i][j])`
			`}`
			`}`

#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`return kind.Hash(detectors)`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`}`