package csv

import (
	"context"
	"encoding/csv"
	"errors"
	"io"
	"strconv"
	"unicode/utf8"

	"github.com/shopspring/decimal"

	"github.com/neilotoole/sq/libsq"
	"github.com/neilotoole/sq/libsq/core/errz"
	"github.com/neilotoole/sq/libsq/core/kind"
	"github.com/neilotoole/sq/libsq/core/lg"
	"github.com/neilotoole/sq/libsq/core/lg/lga"
	"github.com/neilotoole/sq/libsq/core/lg/lgm"
	"github.com/neilotoole/sq/libsq/core/options"
	"github.com/neilotoole/sq/libsq/core/sqlmodel"
	"github.com/neilotoole/sq/libsq/core/sqlz"
	"github.com/neilotoole/sq/libsq/core/stringz"
	"github.com/neilotoole/sq/libsq/driver"
	"github.com/neilotoole/sq/libsq/source"
)

// readAheadBufferSize is the maximum number of records that are read
// ahead (and buffered) to determine CSV metadata before insertion begins.
const readAheadBufferSize = 100

// importCSV loads the src CSV data to scratchDB.
func importCSV(ctx context.Context, src *source.Source,
	openFn source.FileOpenFunc, scratchDB driver.Database,
) error {
	log := lg.FromContext(ctx)
	// TODO: optPredictKind should be read from src.Options.
	const optPredictKind bool = true

	var err error
	var r io.ReadCloser

	r, err = openFn()
	if err != nil {
		return err
	}

	defer lg.WarnIfCloseError(log, lgm.CloseFileReader, r)

	// The CR filter reader deals with CSV files exported from Excel,
	// which can have DOS-style \r EOL markers.
	cr := csv.NewReader(&crFilterReader{r: r})
	cr.Comma, err = getDelimiter(src)
	if err != nil {
		return err
	}

	// readAheadRecs temporarily holds records read from r, for the purpose
	// of determining CSV metadata such as column headers, data kinds, etc.
	// These records are later written to recordCh.
	readAheadRecs := make([][]string, 0, readAheadBufferSize)

	colNames, err := getColNames(cr, src, &readAheadRecs)
	if err != nil {
		return err
	}

	expectFieldCount := len(colNames)

	var colKinds []kind.Kind
	if optPredictKind {
		colKinds, err = predictColKinds(expectFieldCount, cr, &readAheadRecs, readAheadBufferSize)
		if err != nil {
			return err
		}
	} else {
		// If we're not predicting col kinds, every col is kind.Text.
		colKinds = make([]kind.Kind, expectFieldCount)
		for i := range colKinds {
			colKinds[i] = kind.Text
		}
	}

	// Create the dest table in scratchDB.
	tblDef := createTblDef(source.MonotableName, colNames, colKinds)

	err = scratchDB.SQLDriver().CreateTable(ctx, scratchDB.DB(), tblDef)
	if err != nil {
		return errz.Wrap(err, "csv: failed to create dest scratch table")
	}

	recMeta, err := getRecMeta(ctx, scratchDB, tblDef)
	if err != nil {
		return err
	}

	insertWriter := libsq.NewDBWriter(log, scratchDB, tblDef.Name, driver.Tuning.RecordChSize)
	err = execInsert(ctx, insertWriter, recMeta, readAheadRecs, cr)
	if err != nil {
		return err
	}

	inserted, err := insertWriter.Wait()
	if err != nil {
		return err
	}

	log.Debug("Inserted rows",
		lga.Count, inserted,
		lga.Target, source.Target(scratchDB.Source(), tblDef.Name),
	)
	return nil
}

// execInsert inserts the CSV records in readAheadRecs (followed by records
// from the csv.Reader) via recw. The caller should wait on recw to complete.
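//
// A sketch of the expected call sequence (mirroring importCSV above):
//
//	w := libsq.NewDBWriter(log, scratchDB, tblName, driver.Tuning.RecordChSize)
//	err := execInsert(ctx, w, recMeta, readAheadRecs, cr)
//	// handle err...
//	inserted, err := w.Wait()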
func execInsert(ctx context.Context, recw libsq.RecordWriter, recMeta sqlz.RecordMeta,
	readAheadRecs [][]string, r *csv.Reader,
) error {
	ctx, cancelFn := context.WithCancel(ctx)

	recordCh, errCh, err := recw.Open(ctx, cancelFn, recMeta)
	if err != nil {
		return err
	}
	defer close(recordCh)

	// Before we continue reading from CSV, we first write out
	// any CSV records we read earlier.
	for i := range readAheadRecs {
		rec := mungeCSV2InsertRecord(readAheadRecs[i])

		select {
		case err = <-errCh:
			cancelFn()
			return err
		case <-ctx.Done():
			cancelFn()
			return ctx.Err()
		case recordCh <- rec:
		}
	}

	var csvRecord []string
	for {
		csvRecord, err = r.Read()
		if errors.Is(err, io.EOF) {
			// We're done reading.
			return nil
		}
		if err != nil {
			cancelFn()
			return errz.Wrap(err, "read from CSV data source")
		}

		rec := mungeCSV2InsertRecord(csvRecord)

		select {
		case err = <-errCh:
			cancelFn()
			return err
		case <-ctx.Done():
			cancelFn()
			return ctx.Err()
		case recordCh <- rec:
		}
	}
}

// mungeCSV2InsertRecord returns a new []any containing
// the values of the csvRec []string.
func mungeCSV2InsertRecord(csvRec []string) []any {
	a := make([]any, len(csvRec))
	for i := range csvRec {
		a[i] = csvRec[i]
	}
	return a
}

// getRecMeta returns RecordMeta to use with RecordWriter.Open.
func getRecMeta(ctx context.Context, scratchDB driver.Database, tblDef *sqlmodel.TableDef) (sqlz.RecordMeta, error) {
	colTypes, err := scratchDB.SQLDriver().TableColumnTypes(ctx, scratchDB.DB(), tblDef.Name, tblDef.ColNames())
	if err != nil {
		return nil, err
	}

	destMeta, _, err := scratchDB.SQLDriver().RecordMeta(colTypes)
	if err != nil {
		return nil, err
	}

	return destMeta, nil
}

func createTblDef(tblName string, colNames []string, kinds []kind.Kind) *sqlmodel.TableDef {
	tbl := &sqlmodel.TableDef{Name: tblName}

	cols := make([]*sqlmodel.ColDef, len(colNames))
	for i := range colNames {
		cols[i] = &sqlmodel.ColDef{Table: tbl, Name: colNames[i], Kind: kinds[i]}
	}

	tbl.Cols = cols
	return tbl
}

// predictColKinds examines up to maxExamine records in readAheadRecs
// and those returned by r to guess the kind of each field.
// Any additional records read from r are appended to readAheadRecs.
//
// This func considers these candidate kinds, in order of
// precedence: kind.Int, kind.Bool, kind.Decimal.
//
// kind.Decimal is chosen over kind.Float due to its greater flexibility.
// NOTE: Currently kind.Time and kind.Datetime are not examined.
//
// If any field (string) value cannot be parsed into a particular kind, that
// kind is excluded from the list of candidate kinds. The first of any
// remaining candidate kinds for each field is returned, or kind.Text if
// no candidate kinds remain.
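//
// For example, a field whose examined values are ["1", "7", "x"] starts
// with candidates [Int, Bool, Decimal]: "1" parses as all three, "7"
// eliminates Bool, and "x" eliminates the rest, so the field falls back
// to kind.Text. Had the values been just ["1", "7"], kind.Int would win.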
func predictColKinds(expectFieldCount int, r *csv.Reader, readAheadRecs *[][]string,
	maxExamine int,
) ([]kind.Kind, error) {
	// FIXME: [legacy] this function should switch to using kind.Detector.

	candidateKinds := newCandidateFieldKinds(expectFieldCount)
	var examineCount int

	// First, examine the records already in the readAheadRecs buffer.
	for recIndex := 0; recIndex < len(*readAheadRecs) && examineCount < maxExamine; recIndex++ {
		for fieldIndex := 0; fieldIndex < expectFieldCount; fieldIndex++ {
			candidateKinds[fieldIndex] = excludeFieldKinds(candidateKinds[fieldIndex],
				(*readAheadRecs)[recIndex][fieldIndex])
		}
		examineCount++
	}

	// Next, continue to read from r until we reach maxExamine records.
	for ; examineCount < maxExamine; examineCount++ {
		rec, err := r.Read()
		if err == io.EOF { //nolint:errorlint
			break
		}
		if err != nil {
			return nil, errz.Err(err)
		}

		if len(rec) != expectFieldCount {
			// Safety check.
			return nil, errz.Errorf("expected %d fields in CSV record but got %d", expectFieldCount, len(rec))
		}

		for fieldIndex, fieldValue := range rec {
			candidateKinds[fieldIndex] = excludeFieldKinds(candidateKinds[fieldIndex], fieldValue)
		}

		// Append the just-read record to readAheadRecs so that
		// it's not lost.
		*readAheadRecs = append(*readAheadRecs, rec)
	}

	resultKinds := make([]kind.Kind, expectFieldCount)
	for i := range resultKinds {
		switch len(candidateKinds[i]) {
		case 0:
			// If all candidate kinds have been excluded, kind.Text is
			// the fallback option.
			resultKinds[i] = kind.Text
		default:
			// If one or more candidate kinds remain, pick the first
			// available, as it should be the most specific kind.
			resultKinds[i] = candidateKinds[i][0]
		}
	}
	return resultKinds, nil
}

// newCandidateFieldKinds returns a new slice of kind.Kind containing
// potential kinds for a field/column. The kinds are in order of
// precedence.
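//
// For example, newCandidateFieldKinds(2) yields
// [[Int, Bool, Decimal], [Int, Bool, Decimal]].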
func newCandidateFieldKinds(n int) [][]kind.Kind {
	kinds := make([][]kind.Kind, n)
	for i := range kinds {
		k := []kind.Kind{
			kind.Int,
			kind.Bool,
			kind.Decimal,
		}
		kinds[i] = k
	}

	return kinds
}

// excludeFieldKinds returns a filtered copy of fieldCandidateKinds,
// removing those kinds that fieldVal cannot be converted to.
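//
// For example, excludeFieldKinds([Int, Bool, Decimal], "1.5") returns
// [Decimal]: "1.5" is not an int or a bool, but does parse as a decimal.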
func excludeFieldKinds(fieldCandidateKinds []kind.Kind, fieldVal string) []kind.Kind {
	var resultCandidateKinds []kind.Kind

	if fieldVal == "" {
		// An empty string could indicate a NULL value for any kind.
		// That is, we don't exclude a candidate kind due to empty string.
		return fieldCandidateKinds
	}

	for _, knd := range fieldCandidateKinds {
		var err error

		switch knd { //nolint:exhaustive
		case kind.Int:
			_, err = strconv.Atoi(fieldVal)
		case kind.Bool:
			_, err = strconv.ParseBool(fieldVal)
		case kind.Decimal:
			_, err = decimal.NewFromString(fieldVal)
		default:
		}

		if err == nil {
			resultCandidateKinds = append(resultCandidateKinds, knd)
		}
	}

	return resultCandidateKinds
}

// getColNames determines column names, as follows:
//
//   - Col names can be explicitly specified in src.Options; these take
//     precedence.
//   - If the source CSV has a header record, the fields of the
//     header record are returned.
//   - Otherwise, the first data record is read, and col names [A,B,C...]
//     are generated based on the number of fields. That first data record
//     is appended to readAheadRecs so that it's not lost.
//
// Note that cr must not have been previously read.
func getColNames(cr *csv.Reader, src *source.Source, readAheadRecs *[][]string) ([]string, error) {
	// If col names are explicitly provided in opts, we
	// will return them.
	explicitColNames, err := options.GetColNames(src.Options)
	if err != nil {
		return nil, err
	}

	optHasHeaderRecord, _, err := options.HasHeader(src.Options)
	if err != nil {
		return nil, err
	}

	if optHasHeaderRecord {
		// The CSV file has a header record: we need to consume it.
		var headerRec []string
		headerRec, err = cr.Read()
		if err == io.EOF { //nolint:errorlint
			return nil, errz.Errorf("data source %s has no data", src.Handle)
		}
		if err != nil {
			return nil, errz.Err(err)
		}

		if len(explicitColNames) > 0 {
			// If col names were explicitly specified via options, return
			// those col names, as explicit option col names have precedence
			// over the header record col names.
			return explicitColNames, nil
		}

		// Otherwise return the header record col names.
		return headerRec, nil
	}

	// Read ahead the first record. We need this to determine the number
	// of columns.
	firstRec, err := cr.Read()
	if err == io.EOF { //nolint:errorlint
		return nil, errz.Errorf("data source %s is empty", src.Handle)
	}
	if err != nil {
		return nil, errz.Wrapf(err, "read from data source %s", src.Handle)
	}

	// firstRec contains actual data, so append it to readAheadRecs.
	*readAheadRecs = append(*readAheadRecs, firstRec)

	// If we have explicit column names, we still need to verify the
	// column name count against the data.
	if len(explicitColNames) > 0 {
		if len(explicitColNames) != len(firstRec) {
			return nil, errz.Errorf("mismatch: source has %d explicit column names specified, but first data record has %d fields", //nolint:lll
				len(explicitColNames), len(firstRec))
		}

		return explicitColNames, nil
	}

	// The CSV file does not have a header record, so we generate
	// col names [A,B,C...] based on the number of fields in firstRec.
	generatedColNames := make([]string, len(firstRec))
	for i := range firstRec {
		generatedColNames[i] = stringz.GenerateAlphaColName(i, false)
	}

	return generatedColNames, nil
}

// namedDelimiters is a map of named delimiter strings to their
// rune values. For example, "comma" maps to ',' and "pipe" maps to '|'.
var namedDelimiters = map[string]rune{
	"comma":  ',',
	"space":  ' ',
	"pipe":   '|',
	"tab":    '\t',
	"colon":  ':',
	"semi":   ';',
	"period": '.',
}

// getDelimiter returns the delimiter for src. An explicit
// delimiter value may be set in src.Options; otherwise
// the default for the source is returned.
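//
// For example, with no delimiter option set, a TypeTSV source
// yields '\t', and any other source yields the default ','.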
func getDelimiter(src *source.Source) (rune, error) {
	delim, ok, err := getDelimFromOptions(src.Options)
	if err != nil {
		return 0, err
	}

	if ok {
		return delim, nil
	}

	if src.Type == TypeTSV {
		return '\t', nil
	}

	// Default is comma.
	return ',', nil
}

// getDelimFromOptions returns ok as true, and the delimiter rune, if a
// valid delimiter value is provided in opts. It returns ok as false if
// no value is provided, and an error if the provided value is invalid.
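//
// For example (per namedDelimiters above), both "pipe" and "|" yield
// ('|', true, nil), while an unknown name such as "wat" yields an error.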
func getDelimFromOptions(opts options.Options) (r rune, ok bool, err error) {
	if len(opts) == 0 {
		return 0, false, nil
	}

	if _, ok = opts[options.OptDelim]; !ok {
		return 0, false, nil
	}

	val := opts.Get(options.OptDelim)
	if val == "" {
		return 0, false, nil
	}

	if len(val) == 1 {
		r, _ = utf8.DecodeRuneInString(val)
		return r, true, nil
	}

	r, ok = namedDelimiters[val]
	if !ok {
		return 0, false, errz.Errorf("unknown delimiter constant {%s}", val)
	}

	return r, true, nil
}

// crFilterReader is a reader whose Read method converts
// standalone carriage return '\r' bytes to newline '\n'.
// CRLF "\r\n" sequences are untouched.
// This is useful for reading DOS-format data, e.g. a TSV
// file exported by Microsoft Excel.
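//
// For example, input "a\rb\r\nc" reads as "a\nb\r\nc".
//
// Note (an observation, not documented upstream behavior): the filter
// inspects only the current buffer, so a bare '\r' that lands at the very
// end of a Read is rewritten to '\n' even if the next Read begins with
// '\n'; such a split "\r\n" sequence would emerge as "\n\n".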
type crFilterReader struct {
	r io.Reader
}

func (r *crFilterReader) Read(p []byte) (n int, err error) {
	n, err = r.r.Read(p)

	for i := 0; i < n; i++ {
		if p[i] == '\r' {
			if i+1 < n && p[i+1] == '\n' {
				continue // it's \r\n
			}
			// It's a standalone \r: replace it with \n.
			p[i] = '\n'
		}
	}

	return n, err
}