mirror of
https://github.com/neilotoole/sq.git
synced 2024-12-21 07:01:41 +03:00
9c5836ef1c
* xlsx driver now detects header row.
457 lines
9.2 KiB
Go
457 lines
9.2 KiB
Go
package kind
|
|
|
|
import (
|
|
stdj "encoding/json"
|
|
"math/big"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/timez"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
|
"github.com/neilotoole/sq/libsq/core/stringz"
|
|
)
|
|
|
|
// Detector is used to detect the kind of a stream of values.
|
|
// The caller adds values via Sample and then invokes Detect.
|
|
type Detector struct {
|
|
kinds map[Kind]struct{}
|
|
mungeFns map[Kind]MungeFunc
|
|
dirty bool
|
|
|
|
// foundString is set to true if any of the values passed
|
|
// to Detector.Sample had type string.
|
|
foundString bool
|
|
}
|
|
|
|
// NewDetector returns a new instance.
|
|
func NewDetector() *Detector {
|
|
return &Detector{
|
|
kinds: map[Kind]struct{}{ //nolint:exhaustive
|
|
Int: {},
|
|
Float: {},
|
|
Decimal: {},
|
|
Bool: {},
|
|
Time: {},
|
|
Date: {},
|
|
Datetime: {},
|
|
},
|
|
mungeFns: map[Kind]MungeFunc{},
|
|
}
|
|
}
|
|
|
|
// Sample adds a sample to the detector.
|
|
func (d *Detector) Sample(v any) {
|
|
switch v.(type) {
|
|
case nil:
|
|
// Can't glean any info from nil
|
|
return
|
|
default:
|
|
// Don't know what this, so delete all kinds
|
|
d.retain()
|
|
return
|
|
case float32, float64:
|
|
d.retain(Float, Decimal)
|
|
return
|
|
case int, int8, int16, int32, int64:
|
|
d.retain(Int, Float, Decimal)
|
|
return
|
|
case bool:
|
|
d.retain(Bool)
|
|
return
|
|
case time.Time:
|
|
d.retain(Time, Date, Datetime)
|
|
return
|
|
case stdj.Number:
|
|
// JSON number
|
|
d.foundString = true
|
|
d.retain(Decimal)
|
|
return
|
|
case string:
|
|
// We need to do more work to figure out the kind when
|
|
// we're getting string values
|
|
d.foundString = true
|
|
}
|
|
|
|
// We're dealing with a string value, which could a variety
|
|
// of things, such as: "1", "1.0", "true", "11:30".
|
|
d.doSampleString(v.(string))
|
|
}
|
|
|
|
//nolint:gocognit,funlen
|
|
func (d *Detector) doSampleString(s string) {
|
|
if s == "" {
|
|
// Can't really do anything useful with this
|
|
return
|
|
}
|
|
|
|
var err error
|
|
|
|
if d.has(Int) || d.has(Decimal) {
|
|
if strings.ContainsRune(s, '.') {
|
|
// Int cannot contain '.', e.g. "1.0".
|
|
d.delete(Int)
|
|
}
|
|
|
|
if strings.ContainsAny(s, "eE") {
|
|
// Int and Decimal cannot contain E or e.
|
|
// Most likely a float, e.g. "6.67428e-11"
|
|
d.delete(Int, Decimal)
|
|
}
|
|
}
|
|
|
|
if d.has(Decimal) {
|
|
// If Decimal is still a candidate, check that we can parse it
|
|
if _, _, err = big.ParseFloat(s, 10, 64, 0); err != nil {
|
|
// If s cannot be parsed as a decimal, it also can't
|
|
// be int or float
|
|
d.delete(Decimal, Int, Float)
|
|
} else {
|
|
// s can be parsed as decimal, can't be time
|
|
d.delete(Time, Date, Datetime)
|
|
}
|
|
}
|
|
|
|
if d.has(Int) {
|
|
if _, err = strconv.ParseInt(s, 10, 64); err != nil {
|
|
d.delete(Int)
|
|
} else {
|
|
// s can be parsed as int, can't be time
|
|
d.delete(Time, Date, Datetime)
|
|
}
|
|
}
|
|
|
|
if d.has(Float) {
|
|
if _, err = strconv.ParseFloat(s, 64); err != nil {
|
|
d.delete(Float)
|
|
} else {
|
|
// s can be parsed as float, can't be time
|
|
d.delete(Time, Date, Datetime)
|
|
}
|
|
}
|
|
|
|
if d.has(Bool) {
|
|
if _, err = stringz.ParseBool(s); err != nil {
|
|
d.delete(Bool)
|
|
} else {
|
|
// s can be parsed as bool, can't be time,
|
|
// but still could be a number ("1" == true)
|
|
d.delete(Time, Date, Datetime)
|
|
}
|
|
}
|
|
|
|
if d.has(Time) {
|
|
ok, format := detectKindTime(s)
|
|
if !ok {
|
|
// It's not a recognized time format
|
|
d.delete(Time)
|
|
} else {
|
|
// If it's kind.Time, it can't be anything else
|
|
d.retain(Time)
|
|
|
|
d.mungeFns[Time] = func(val any) (any, error) {
|
|
if val == nil {
|
|
return nil, nil //nolint:nilnil
|
|
}
|
|
|
|
s, ok = val.(string)
|
|
if !ok {
|
|
return nil, errz.Errorf("expected %T to be string", val)
|
|
}
|
|
|
|
if s == "" {
|
|
return nil, nil //nolint:nilnil
|
|
}
|
|
|
|
var t time.Time
|
|
t, err = time.Parse(format, s)
|
|
if err != nil {
|
|
return nil, errz.Err(err)
|
|
}
|
|
|
|
return t.Format(format), nil
|
|
}
|
|
}
|
|
}
|
|
|
|
if d.has(Date) {
|
|
ok, format := detectKindDate(s)
|
|
if !ok {
|
|
// It's not a recognized date format
|
|
d.delete(Date)
|
|
} else {
|
|
// If it's kind.Date, it can't be anything else
|
|
d.retain(Date)
|
|
|
|
d.mungeFns[Date] = func(val any) (any, error) {
|
|
if val == nil {
|
|
return nil, nil //nolint:nilnil
|
|
}
|
|
|
|
s, ok = val.(string)
|
|
if !ok {
|
|
return nil, errz.Errorf("expected %T to be string", val)
|
|
}
|
|
|
|
if s == "" {
|
|
return nil, nil //nolint:nilnil
|
|
}
|
|
|
|
var t time.Time
|
|
t, err = time.Parse(format, s)
|
|
if err != nil {
|
|
return nil, errz.Err(err)
|
|
}
|
|
|
|
return t.Format(format), nil
|
|
}
|
|
}
|
|
}
|
|
|
|
if d.has(Datetime) {
|
|
ok, format := detectKindDatetime(s)
|
|
if !ok {
|
|
// It's not a recognized datetime format
|
|
d.delete(Datetime)
|
|
} else {
|
|
// If it's kind.Datetime, it can't be anything else
|
|
d.retain(Datetime)
|
|
|
|
// This mungeFn differs from kind.Date and kind.Time in that
|
|
// it returns a time.Time instead of a string
|
|
d.mungeFns[Datetime] = func(val any) (any, error) {
|
|
if val == nil {
|
|
return nil, nil //nolint:nilnil
|
|
}
|
|
|
|
s, ok := val.(string)
|
|
if !ok {
|
|
return nil, errz.Errorf("expected %T to be string", val)
|
|
}
|
|
|
|
if s == "" {
|
|
return nil, nil //nolint:nilnil
|
|
}
|
|
|
|
t, err := time.Parse(format, s)
|
|
if err != nil {
|
|
return nil, errz.Err(err)
|
|
}
|
|
|
|
return t, nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Detect returns the detected Kind. If ambiguous, Text is returned,
|
|
// unless all sampled values were nil, in which case Null is returned.
|
|
// If the returned mungeFn is non-nil, it can be used to convert input
|
|
// values to their canonical form. For example, for Datetime the MungeFunc
|
|
// would accept string "2020-06-11T02:50:54Z" and return a time.Time,
|
|
// while for Date, the MungeFunc would accept "1970-01-01" or "01 Jan 1970"
|
|
// and always return a string in the canonicalized form "1970-01-01".
|
|
func (d *Detector) Detect() (kind Kind, mungeFn MungeFunc, err error) {
|
|
if !d.dirty {
|
|
if d.foundString {
|
|
return Text, nil, nil
|
|
}
|
|
|
|
// If we haven't filtered any kinds, default to Null.
|
|
return Null, nil, nil
|
|
}
|
|
|
|
switch len(d.kinds) {
|
|
case 0:
|
|
return Text, nil, nil
|
|
case 1:
|
|
for k := range d.kinds {
|
|
return k, d.mungeFns[k], nil
|
|
}
|
|
}
|
|
|
|
// NOTE: this logic below about detecting the remaining type
|
|
// is a bit sketchy. If you're debugging this code, it's
|
|
// probably the case that the code below is faulty.
|
|
if d.has(Time, Date, Datetime) {
|
|
// If all three time types are left, use the most
|
|
// general, i.e. Datetime.
|
|
return Datetime, d.mungeFns[Datetime], nil
|
|
}
|
|
|
|
if d.has(Time) {
|
|
return Time, d.mungeFns[Time], nil
|
|
}
|
|
|
|
if d.has(Date) {
|
|
return Date, d.mungeFns[Date], nil
|
|
}
|
|
|
|
if d.has(Datetime) {
|
|
return Datetime, d.mungeFns[Datetime], nil
|
|
}
|
|
|
|
if d.foundString {
|
|
if d.has(Decimal, Int) {
|
|
// If we have to choose between Decimal and Int, we
|
|
// pick Int, as it's stricter.
|
|
return Int, nil, nil
|
|
}
|
|
|
|
if d.has(Decimal) {
|
|
return Decimal, nil, nil
|
|
}
|
|
}
|
|
|
|
if d.has(Int) {
|
|
return Int, nil, nil
|
|
}
|
|
|
|
if d.has(Float) {
|
|
return Float, nil, nil
|
|
}
|
|
|
|
if d.has(Bool) {
|
|
return Bool, nil, nil
|
|
}
|
|
|
|
return Text, nil, nil
|
|
}
|
|
|
|
// delete deletes each of kinds from d.kinds.
|
|
func (d *Detector) delete(kinds ...Kind) {
|
|
d.dirty = true
|
|
for _, k := range kinds {
|
|
delete(d.kinds, k)
|
|
}
|
|
}
|
|
|
|
// retain deletes everything from kd.kinds except items
|
|
// contains in the kinds arg. If kinds is empty, d.kinds is
|
|
// emptied.
|
|
func (d *Detector) retain(kinds ...Kind) {
|
|
d.dirty = true
|
|
for k := range d.kinds {
|
|
if !containsKind(k, kinds...) {
|
|
delete(d.kinds, k)
|
|
}
|
|
}
|
|
}
|
|
|
|
// has returns true if kd.kinds contains each of k.
|
|
func (d *Detector) has(kinds ...Kind) bool {
|
|
var ok bool
|
|
for _, k := range kinds {
|
|
if _, ok = d.kinds[k]; !ok {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// detectKindTime reports whether s parses as a time of day. On
// success, format is the first matching layout, tried in the order
// time.TimeOnly, "15:04", time.Kitchen. On failure, or if s is
// empty, it returns false and the empty string.
func detectKindTime(s string) (ok bool, format string) {
	if s == "" {
		return false, ""
	}

	const timeNoSecsFormat = "15:04"

	for _, layout := range [...]string{time.TimeOnly, timeNoSecsFormat, time.Kitchen} {
		if _, parseErr := time.Parse(layout, s); parseErr != nil {
			continue
		}

		return true, layout
	}

	return false, ""
}
|
|
|
|
// detectKindDate reports whether s parses as a calendar date. On
// success, format is the first matching layout, tried in the order
// time.DateOnly ("2006-01-02"), "02 Jan 2006", "2006/01/02". On
// failure, or if s is empty, it returns false and the empty string.
func detectKindDate(s string) (ok bool, format string) {
	if s == "" {
		return false, ""
	}

	const (
		format1 = "02 Jan 2006"
		format2 = "2006/01/02"
	)

	// time.DateOnly already is "2006-01-02", so that layout is listed
	// only once; a second identical entry would merely repeat a parse
	// attempt that has already failed.
	formats := [...]string{time.DateOnly, format1, format2}

	for _, f := range formats {
		if _, err := time.Parse(f, s); err == nil {
			return true, f
		}
	}

	return false, ""
}
|
|
|
|
func detectKindDatetime(s string) (ok bool, format string) {
|
|
if s == "" {
|
|
return false, ""
|
|
}
|
|
|
|
formats := []string{
|
|
time.RFC3339,
|
|
timez.RFC3339Z,
|
|
timez.ISO8601,
|
|
time.RFC3339Nano,
|
|
time.ANSIC,
|
|
time.UnixDate,
|
|
time.RubyDate,
|
|
time.RFC822,
|
|
time.RFC822Z,
|
|
time.RFC850,
|
|
time.RFC1123,
|
|
time.RFC1123Z,
|
|
time.Stamp,
|
|
time.StampMilli,
|
|
time.StampMicro,
|
|
time.StampNano,
|
|
}
|
|
var err error
|
|
|
|
for _, f := range formats {
|
|
if _, err = time.Parse(f, s); err == nil {
|
|
return true, f
|
|
}
|
|
}
|
|
|
|
return false, ""
|
|
}
|
|
|
|
// Hash generates a hash from the kinds returned by
|
|
// the detectors. The detectors should already have
|
|
// sampled data.
|
|
func Hash(detectors []*Detector) (h string, err error) {
|
|
if len(detectors) == 0 {
|
|
return "", errz.New("no kind detectors")
|
|
}
|
|
|
|
kinds := make([]Kind, len(detectors))
|
|
for i := range detectors {
|
|
kinds[i], _, err = detectors[i].Detect()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
|
|
// TODO: use an actual hash function
|
|
hash := strings.Builder{}
|
|
for i := range kinds {
|
|
if i > 0 {
|
|
hash.WriteRune('|')
|
|
}
|
|
hash.WriteString(kinds[i].String())
|
|
}
|
|
|
|
h = hash.String()
|
|
return h, nil
|
|
}
|