mirror of
https://github.com/neilotoole/sq.git
synced 2024-12-24 00:22:57 +03:00
4ffaae925f
* CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar
112 lines
2.6 KiB
Go
112 lines
2.6 KiB
Go
package csv
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
|
|
"github.com/neilotoole/sq/libsq/driver"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/lg"
|
|
"github.com/neilotoole/sq/libsq/core/lg/lga"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/options"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
|
"github.com/neilotoole/sq/libsq/core/kind"
|
|
)
|
|
|
|
// hasHeaderRow returns true if a header row is explicitly
|
|
// set in opts, or if detectHeaderRow detects that the first
|
|
// row of recs seems to be a header.
|
|
func hasHeaderRow(ctx context.Context, recs [][]string, opts options.Options) (bool, error) {
|
|
if driver.OptIngestHeader.IsSet(opts) {
|
|
b := driver.OptIngestHeader.Get(opts)
|
|
lg.FromContext(ctx).Debug("CSV ingest header explicitly specified: skipping header detection",
|
|
lga.Key, driver.OptIngestHeader.Key(),
|
|
lga.Val, b)
|
|
return b, nil
|
|
}
|
|
|
|
return detectHeaderRow(recs)
|
|
}
|
|
|
|
// detectHeaderRow returns true if recs has a header row.
|
|
// The recs arg should be regularly shaped: each rec should
|
|
// have the same number of fields.
|
|
func detectHeaderRow(recs [][]string) (hasHeader bool, err error) {
|
|
if len(recs) < 2 {
|
|
// If zero records, obviously no header row.
|
|
// If one record... well, is there any way of determining if
|
|
// it's a header row or not? Probably best to treat it as a data row.
|
|
return false, nil
|
|
}
|
|
|
|
firstRowHash, err := calcKindHash(recs[0:1])
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
remainderHash, err := calcKindHash(recs[1:])
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
if firstRowHash != remainderHash {
|
|
return true, nil
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
// Hash generates a hash from the kinds returned by
|
|
// the detectors. The detectors should already have
|
|
// sampled data.
|
|
//
|
|
// TODO: move Hash to pkg libsq/core/kind?
|
|
func Hash(detectors []*kind.Detector) (h string, err error) {
|
|
if len(detectors) == 0 {
|
|
return "", errz.New("no kind detectors")
|
|
}
|
|
|
|
kinds := make([]kind.Kind, len(detectors))
|
|
for i := range detectors {
|
|
kinds[i], _, err = detectors[i].Detect()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
|
|
// TODO: use an actual hash function
|
|
hash := strings.Builder{}
|
|
for i := range kinds {
|
|
if i > 0 {
|
|
hash.WriteRune('|')
|
|
}
|
|
hash.WriteString(kinds[i].String())
|
|
}
|
|
|
|
h = hash.String()
|
|
return h, nil
|
|
}
|
|
|
|
func calcKindHash(recs [][]string) (string, error) {
|
|
if len(recs) == 0 || len(recs[0]) == 0 {
|
|
return "", errz.New("no records")
|
|
}
|
|
|
|
fieldCount := len(recs[0])
|
|
|
|
detectors := make([]*kind.Detector, len(recs[0]))
|
|
for i := 0; i < fieldCount; i++ {
|
|
detectors[i] = kind.NewDetector()
|
|
}
|
|
|
|
for i := range recs {
|
|
for j := range recs[i] {
|
|
detectors[j].Sample(recs[i][j])
|
|
}
|
|
}
|
|
|
|
return Hash(detectors)
|
|
}
|