2020-08-23 22:00:13 +03:00
|
|
|
// Package kind encapsulates the notion of data "kind": that is, it
|
|
|
|
// is an abstraction over data types across implementations.
|
2020-08-23 13:42:15 +03:00
|
|
|
package kind
|
|
|
|
|
|
|
|
import (
|
|
|
|
stdj "encoding/json"
|
|
|
|
"math/big"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/stringz"
|
|
|
|
)
|
|
|
|
|
|
|
|
const (
|
|
|
|
// Unknown indicates an unknown kind.
|
|
|
|
Unknown Kind = iota
|
|
|
|
|
|
|
|
// Null indicates a NULL kind.
|
|
|
|
Null
|
|
|
|
|
|
|
|
// Text indicates a text kind.
|
|
|
|
Text
|
|
|
|
|
|
|
|
// Int indicates an integer kind.
|
|
|
|
Int
|
|
|
|
|
|
|
|
// Float indicates a float kind.
|
|
|
|
Float
|
|
|
|
|
|
|
|
// Decimal indicates a decimal kind.
|
|
|
|
Decimal
|
|
|
|
|
|
|
|
// Bool indicates a boolean kind.
|
|
|
|
Bool
|
|
|
|
|
|
|
|
// Bytes indicates a bytes or blob kind.
|
|
|
|
Bytes
|
|
|
|
|
|
|
|
// Datetime indicates a date-time kind.
|
|
|
|
Datetime
|
|
|
|
|
|
|
|
// Date indicates a date-only kind.
|
|
|
|
Date
|
|
|
|
|
|
|
|
// Time indicates a time-only kind.
|
|
|
|
Time
|
|
|
|
)
|
|
|
|
|
|
|
|
// Kind is a generic data kind. A Kind ultimately corresponds to some
// more specific, implementation-level data type, for example a
// SQL VARCHAR or a JSON boolean.
type Kind int
|
|
|
|
|
|
|
|
func (k Kind) String() string {
|
|
|
|
t, err := k.MarshalText()
|
|
|
|
if err != nil {
|
|
|
|
return "<err>"
|
|
|
|
}
|
|
|
|
|
|
|
|
return string(t)
|
|
|
|
}
|
|
|
|
|
|
|
|
// MarshalJSON implements json.Marshaler.
|
|
|
|
func (k Kind) MarshalJSON() ([]byte, error) {
|
|
|
|
t, err := k.MarshalText()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return []byte(`"` + string(t) + `"`), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// MarshalText implements encoding.TextMarshaler.
|
|
|
|
func (k Kind) MarshalText() ([]byte, error) {
|
|
|
|
var name string
|
|
|
|
switch k {
|
|
|
|
case Unknown:
|
|
|
|
name = "unknown"
|
|
|
|
case Null:
|
|
|
|
name = "null"
|
|
|
|
case Text:
|
|
|
|
name = "text"
|
|
|
|
case Int:
|
|
|
|
name = "int"
|
|
|
|
case Float:
|
|
|
|
name = "float"
|
|
|
|
case Decimal:
|
|
|
|
name = "decimal"
|
|
|
|
case Bool:
|
|
|
|
name = "bool"
|
|
|
|
case Datetime:
|
|
|
|
name = "datetime"
|
|
|
|
case Date:
|
|
|
|
name = "date"
|
|
|
|
case Time:
|
|
|
|
name = "time"
|
|
|
|
case Bytes:
|
|
|
|
name = "bytes"
|
|
|
|
default:
|
|
|
|
return nil, errz.Errorf("invalid data kind '%d'", k)
|
|
|
|
}
|
|
|
|
|
|
|
|
return []byte(name), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// UnmarshalText implements encoding.TextUnmarshaler.
|
|
|
|
func (k *Kind) UnmarshalText(text []byte) error {
|
|
|
|
kind, err := parse(string(text))
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
*k = kind
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// parse parses text and returns the appropriate kind, or
|
|
|
|
// an error.
|
|
|
|
func parse(text string) (Kind, error) {
|
|
|
|
switch strings.ToLower(text) {
|
|
|
|
default:
|
|
|
|
return Unknown, errz.Errorf("unrecognized kind name %q", text)
|
|
|
|
case "unknown":
|
|
|
|
return Unknown, nil
|
|
|
|
case "text":
|
|
|
|
return Text, nil
|
|
|
|
case "int":
|
|
|
|
return Int, nil
|
|
|
|
case "float":
|
|
|
|
return Float, nil
|
|
|
|
case "decimal":
|
|
|
|
return Decimal, nil
|
|
|
|
case "bool":
|
|
|
|
return Bool, nil
|
|
|
|
case "datetime":
|
|
|
|
return Datetime, nil
|
|
|
|
case "date":
|
|
|
|
return Date, nil
|
|
|
|
case "time":
|
|
|
|
return Time, nil
|
|
|
|
case "bytes":
|
|
|
|
return Bytes, nil
|
|
|
|
case "null":
|
|
|
|
return Null, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Detector is used to detect the kind of a stream of values.
|
|
|
|
// The caller adds values via Sample and then invokes Detect.
|
|
|
|
type Detector struct {
|
|
|
|
kinds map[Kind]struct{}
|
2020-10-20 18:05:43 +03:00
|
|
|
mungeFns map[Kind]MungeFunc
|
2020-08-23 13:42:15 +03:00
|
|
|
dirty bool
|
|
|
|
foundString bool
|
|
|
|
}
|
|
|
|
|
2020-10-20 18:05:43 +03:00
|
|
|
// MungeFunc accepts a value and returns a munged value of the
// appropriate Kind, or an error if the value can't be munged.
// For example, a Datetime MungeFunc would accept the string
// "2020-06-11T02:50:54Z" and return a time.Time.
type MungeFunc func(interface{}) (interface{}, error)
|
|
|
|
|
2020-08-23 13:42:15 +03:00
|
|
|
// NewDetector returns a new instance.
|
|
|
|
func NewDetector() *Detector {
|
|
|
|
return &Detector{
|
|
|
|
kinds: map[Kind]struct{}{
|
|
|
|
Int: {},
|
|
|
|
Float: {},
|
|
|
|
Decimal: {},
|
|
|
|
Bool: {},
|
|
|
|
Time: {},
|
|
|
|
Date: {},
|
|
|
|
Datetime: {},
|
|
|
|
},
|
2020-10-20 18:05:43 +03:00
|
|
|
mungeFns: map[Kind]MungeFunc{},
|
2020-08-23 13:42:15 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sample adds a sample to the detector.
|
|
|
|
func (d *Detector) Sample(v interface{}) {
|
|
|
|
switch v.(type) {
|
|
|
|
case nil:
|
|
|
|
// Can't glean any info from nil
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
// Don't know what this, so delete all kinds
|
|
|
|
d.retain()
|
|
|
|
return
|
|
|
|
case float32, float64:
|
|
|
|
d.retain(Float, Decimal)
|
|
|
|
return
|
|
|
|
case int, int8, int16, int32, int64:
|
|
|
|
d.retain(Int, Float, Decimal)
|
|
|
|
return
|
|
|
|
case bool:
|
|
|
|
d.retain(Bool)
|
|
|
|
return
|
|
|
|
case time.Time:
|
|
|
|
d.retain(Time, Date, Datetime)
|
2020-11-02 20:40:29 +03:00
|
|
|
return
|
2020-08-23 13:42:15 +03:00
|
|
|
case stdj.Number:
|
|
|
|
// JSON number
|
|
|
|
d.foundString = true
|
|
|
|
d.retain(Decimal)
|
|
|
|
return
|
|
|
|
case string:
|
|
|
|
// We need to do more work to figure out the kind when
|
|
|
|
// we're getting string values
|
|
|
|
d.foundString = true
|
|
|
|
}
|
|
|
|
|
|
|
|
// We're dealing with a string value, which could a variety
|
|
|
|
// of things, such as: "1", "1.0", "true", "11:30".
|
2021-09-13 01:14:30 +03:00
|
|
|
d.doSampleString(v.(string))
|
|
|
|
}
|
2020-08-23 13:42:15 +03:00
|
|
|
|
2021-09-13 01:14:30 +03:00
|
|
|
//nolint:dupl,gocognit
|
|
|
|
func (d *Detector) doSampleString(s string) {
|
2020-08-23 13:42:15 +03:00
|
|
|
if s == "" {
|
|
|
|
// Can't really do anything useful with this
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
var err error
|
|
|
|
|
|
|
|
if d.has(Decimal) {
|
|
|
|
// If Decimal is still a candidate, check that we can parse it
|
|
|
|
if _, _, err = big.ParseFloat(s, 10, 64, 0); err != nil {
|
|
|
|
// If s cannot be parsed as a decimal, it also can't
|
|
|
|
// be int or float
|
|
|
|
d.delete(Decimal, Int, Float)
|
|
|
|
} else {
|
|
|
|
// s can be parsed as decimal, can't be time
|
|
|
|
d.delete(Time, Date, Datetime)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Int) {
|
|
|
|
if _, err = strconv.ParseInt(s, 10, 64); err != nil {
|
|
|
|
d.delete(Int)
|
|
|
|
} else {
|
|
|
|
// s can be parsed as int, can't be time
|
|
|
|
d.delete(Time, Date, Datetime)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Float) {
|
|
|
|
if _, err = strconv.ParseFloat(s, 64); err != nil {
|
|
|
|
d.delete(Float)
|
|
|
|
} else {
|
|
|
|
// s can be parsed as float, can't be time
|
|
|
|
d.delete(Time, Date, Datetime)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Bool) {
|
|
|
|
if _, err = stringz.ParseBool(s); err != nil {
|
|
|
|
d.delete(Bool)
|
|
|
|
} else {
|
|
|
|
// s can be parsed as bool, can't be time,
|
|
|
|
// but still could be int ("1" == true)
|
|
|
|
d.delete(Float, Time, Date, Datetime)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Time) {
|
|
|
|
ok, format := detectKindTime(s)
|
|
|
|
if !ok {
|
|
|
|
// It's not a recognized time format
|
|
|
|
d.delete(Time)
|
|
|
|
} else {
|
|
|
|
// If it's kind.Time, it can't be anything else
|
|
|
|
d.retain(Time)
|
|
|
|
|
|
|
|
d.mungeFns[Time] = func(val interface{}) (interface{}, error) {
|
|
|
|
if val == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-11-02 20:40:29 +03:00
|
|
|
s, ok = val.(string)
|
2020-08-23 13:42:15 +03:00
|
|
|
if !ok {
|
|
|
|
return nil, errz.Errorf("expected %T to be string", val)
|
|
|
|
}
|
|
|
|
|
|
|
|
if s == "" {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-11-02 20:40:29 +03:00
|
|
|
var t time.Time
|
|
|
|
t, err = time.Parse(format, s)
|
2020-08-23 13:42:15 +03:00
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return t.Format(format), nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Date) {
|
|
|
|
ok, format := detectKindDate(s)
|
|
|
|
if !ok {
|
|
|
|
// It's not a recognized date format
|
|
|
|
d.delete(Date)
|
|
|
|
} else {
|
|
|
|
// If it's kind.Date, it can't be anything else
|
|
|
|
d.retain(Date)
|
|
|
|
|
|
|
|
d.mungeFns[Date] = func(val interface{}) (interface{}, error) {
|
|
|
|
if val == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-11-02 20:40:29 +03:00
|
|
|
s, ok = val.(string)
|
2020-08-23 13:42:15 +03:00
|
|
|
if !ok {
|
|
|
|
return nil, errz.Errorf("expected %T to be string", val)
|
|
|
|
}
|
|
|
|
|
|
|
|
if s == "" {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
2020-11-02 20:40:29 +03:00
|
|
|
var t time.Time
|
|
|
|
t, err = time.Parse(format, s)
|
2020-08-23 13:42:15 +03:00
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return t.Format(format), nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Datetime) {
|
|
|
|
ok, format := detectKindDatetime(s)
|
|
|
|
if !ok {
|
|
|
|
// It's not a recognized datetime format
|
|
|
|
d.delete(Datetime)
|
|
|
|
} else {
|
|
|
|
// If it's kind.Datetime, it can't be anything else
|
|
|
|
d.retain(Datetime)
|
|
|
|
|
|
|
|
// This mungeFn differs from kind.Date and kind.Time in that
|
|
|
|
// it returns a time.Time instead of a string
|
|
|
|
d.mungeFns[Datetime] = func(val interface{}) (interface{}, error) {
|
|
|
|
if val == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
s, ok := val.(string)
|
|
|
|
if !ok {
|
|
|
|
return nil, errz.Errorf("expected %T to be string", val)
|
|
|
|
}
|
|
|
|
|
|
|
|
if s == "" {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
t, err := time.Parse(format, s)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return t, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-02 20:40:29 +03:00
|
|
|
// Detect returns the detected Kind. If ambiguous, Text is returned,
|
|
|
|
// unless all sampled values were nil, in which case Null is returned.
|
2020-08-23 13:42:15 +03:00
|
|
|
// If the returned mungeFn is non-nil, it can be used to convert input
|
2020-10-20 18:05:43 +03:00
|
|
|
// values to their canonical form. For example, for Datetime the MungeFunc
|
2020-08-23 13:42:15 +03:00
|
|
|
// would accept string "2020-06-11T02:50:54Z" and return a time.Time,
|
2020-10-20 18:05:43 +03:00
|
|
|
// while for Date, the MungeFunc would accept "1970-01-01" or "01 Jan 1970"
|
2020-08-23 13:42:15 +03:00
|
|
|
// and always return a string in the canonicalized form "1970-01-01".
|
2020-10-20 18:05:43 +03:00
|
|
|
func (d *Detector) Detect() (kind Kind, mungeFn MungeFunc, err error) {
|
2020-08-23 13:42:15 +03:00
|
|
|
if !d.dirty {
|
2020-11-02 20:40:29 +03:00
|
|
|
if d.foundString {
|
|
|
|
return Text, nil, nil
|
|
|
|
}
|
|
|
|
|
2020-08-23 13:42:15 +03:00
|
|
|
// If we haven't filtered any kinds, default to Text.
|
2020-11-02 20:40:29 +03:00
|
|
|
return Null, nil, nil
|
2020-08-23 13:42:15 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
switch len(d.kinds) {
|
|
|
|
case 0:
|
|
|
|
return Text, nil, nil
|
|
|
|
case 1:
|
|
|
|
for k := range d.kinds {
|
|
|
|
return k, d.mungeFns[k], nil
|
|
|
|
}
|
2020-11-02 20:40:29 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// NOTE: this logic below about detecting the remaining type
|
|
|
|
// is a bit sketchy. If you're debugging this code, it's
|
|
|
|
// probably the case that the code below is faulty.
|
|
|
|
if d.has(Time, Date, Datetime) {
|
|
|
|
// If all three time types are left, use the most
|
|
|
|
// general, i.e. Datetime.
|
|
|
|
return Datetime, d.mungeFns[Datetime], nil
|
2020-08-23 13:42:15 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Time) {
|
|
|
|
return Time, d.mungeFns[Time], nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Date) {
|
|
|
|
return Date, d.mungeFns[Date], nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Datetime) {
|
|
|
|
return Datetime, d.mungeFns[Datetime], nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.foundString && d.has(Decimal) {
|
|
|
|
return Decimal, nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Int) {
|
|
|
|
return Int, nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Float) {
|
|
|
|
return Float, nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if d.has(Bool) {
|
|
|
|
return Bool, nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return Text, nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// delete deletes each of kinds from kd.kinds
|
|
|
|
func (d *Detector) delete(kinds ...Kind) {
|
|
|
|
d.dirty = true
|
|
|
|
for _, k := range kinds {
|
|
|
|
delete(d.kinds, k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// retain deletes everything from kd.kinds except items
|
|
|
|
// contains in the kinds arg. If kinds is empty, kd.kinds is
|
|
|
|
// be emptied.
|
|
|
|
func (d *Detector) retain(kinds ...Kind) {
|
|
|
|
d.dirty = true
|
|
|
|
for k := range d.kinds {
|
|
|
|
if !containsKind(k, kinds...) {
|
|
|
|
delete(d.kinds, k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// has returns true if kd.kinds contains each of k.
|
|
|
|
func (d *Detector) has(kinds ...Kind) bool {
|
|
|
|
var ok bool
|
|
|
|
for _, k := range kinds {
|
|
|
|
if _, ok = d.kinds[k]; !ok {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
func detectKindTime(s string) (ok bool, format string) {
|
|
|
|
if s == "" {
|
|
|
|
return false, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
const timeNoSecsFormat = "15:04"
|
|
|
|
formats := []string{stringz.TimeFormat, timeNoSecsFormat, time.Kitchen}
|
|
|
|
var err error
|
|
|
|
|
|
|
|
for _, f := range formats {
|
|
|
|
if _, err = time.Parse(f, s); err == nil {
|
|
|
|
return true, f
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
func detectKindDate(s string) (ok bool, format string) {
|
|
|
|
if s == "" {
|
|
|
|
return false, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
const (
|
|
|
|
format1 = "02 Jan 2006"
|
|
|
|
format2 = "2006/01/02"
|
|
|
|
format3 = "2006-01-02"
|
|
|
|
)
|
|
|
|
|
|
|
|
formats := []string{stringz.DateFormat, format1, format2, format3}
|
|
|
|
var err error
|
|
|
|
|
|
|
|
for _, f := range formats {
|
|
|
|
if _, err = time.Parse(f, s); err == nil {
|
|
|
|
return true, f
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
func detectKindDatetime(s string) (ok bool, format string) {
|
|
|
|
if s == "" {
|
|
|
|
return false, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
formats := []string{
|
|
|
|
stringz.DatetimeFormat, // RFC3339Nano
|
|
|
|
time.ANSIC,
|
|
|
|
time.UnixDate,
|
|
|
|
time.RubyDate,
|
|
|
|
time.RFC822,
|
|
|
|
time.RFC822Z,
|
|
|
|
time.RFC850,
|
|
|
|
time.RFC1123,
|
|
|
|
time.RFC1123Z,
|
|
|
|
time.RFC3339,
|
|
|
|
time.Stamp,
|
|
|
|
time.StampMilli,
|
|
|
|
time.StampMicro,
|
|
|
|
time.StampNano,
|
|
|
|
}
|
|
|
|
var err error
|
|
|
|
|
|
|
|
for _, f := range formats {
|
|
|
|
if _, err = time.Parse(f, s); err == nil {
|
|
|
|
return true, f
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
func containsKind(needle Kind, haystack ...Kind) bool {
|
|
|
|
for _, k := range haystack {
|
|
|
|
if k == needle {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|