sq/drivers/csv/ingest.go

package csv

import (
	"context"
	"encoding/csv"
	"errors"
	"io"
	"unicode/utf8"

	"github.com/neilotoole/sq/drivers"

	"github.com/neilotoole/sq/libsq/core/kind"
	"github.com/neilotoole/sq/libsq/core/sqlz"

	"github.com/neilotoole/sq/libsq/core/stringz"

	"github.com/neilotoole/sq/libsq/core/lg/lga"

	"github.com/neilotoole/sq/libsq/core/lg/lgm"

	"github.com/neilotoole/sq/libsq/core/lg"

	"github.com/neilotoole/sq/libsq"
	"github.com/neilotoole/sq/libsq/core/errz"
	"github.com/neilotoole/sq/libsq/core/options"
	"github.com/neilotoole/sq/libsq/driver"
	"github.com/neilotoole/sq/libsq/source"
)

// OptEmptyAsNull determines if an empty CSV field is treated as NULL
// or as the zero value for the kind of that field. It is registered
// under the key "driver.csv.empty-as-null". When ingestCSV finds the
// option enabled for a source, it calls configureEmptyNullMunge so
// that empty strings in text columns are munged to nil.
var OptEmptyAsNull = options.NewBool(
	"driver.csv.empty-as-null",
	0,
	true,
	"Treat ingest empty CSV fields as NULL",
	`When true, empty CSV fields are treated as NULL. When false,
the zero value for that type is used, e.g. empty string or 0.`,
	"source",
	"csv",
)

// OptDelim specifies the CSV delimiter to use. It is registered under
// the key "driver.csv.delim". The value is either one of the names
// returned by NamedDelims (e.g. "comma", "tab"), or a single-character
// string that is used as the delimiter directly; getDelimFromOptions
// performs that translation.
var OptDelim = options.NewString(
	"driver.csv.delim",
	0,
	delimCommaKey,
	"Delimiter for ingest CSV data",
	`Delimiter to use for CSV files. Default is "comma".
Possible values are: comma, space, pipe, tab, colon, semi, period.`,
	"source",
	"csv",
)

// ingestCSV loads the src CSV data into scratchDB.
//
// The reader returned by openFn is wrapped in a CSV reader using the
// delimiter resolved by getDelimiter. Up to drivers.OptIngestSampleSize
// records are read as a sample; the sample is used to decide whether a
// header row is present and to detect the column kinds. A table named
// source.MonotableName is then created in scratchDB, and the sample
// records together with the CSV reader are handed to execInsert
// (presumably so that it can continue with the unread remainder of the
// data — confirm against execInsert).
//
// An error is returned if any step fails, or if the source yields no
// records at all, in which case there is nothing to derive column
// names or kinds from. The reader returned by openFn is always closed
// before ingestCSV returns.
func ingestCSV(ctx context.Context, src *source.Source, openFn source.FileOpenFunc, scratchDB driver.Database) error {
	log := lg.FromContext(ctx)

	r, err := openFn()
	if err != nil {
		return err
	}
	defer lg.WarnIfCloseError(log, lgm.CloseFileReader, r)

	delim, err := getDelimiter(src)
	if err != nil {
		return err
	}

	cr := newCSVReader(r, delim)
	recs, err := readRecords(cr, drivers.OptIngestSampleSize.Get(src.Options))
	if err != nil {
		return err
	}

	headerPresent, err := hasHeaderRow(ctx, recs, src.Options)
	if err != nil {
		return err
	}

	// Both branches below index recs[0], so an empty sample must be
	// rejected here rather than being allowed to panic.
	if len(recs) == 0 {
		return errz.Errorf("csv: no records found in source data")
	}

	var header []string
	if headerPresent {
		header = recs[0]

		// We're done with the first row
		recs = recs[1:]
	} else {
		// The CSV file does not have a header record. We will generate
		// col names [A,B,C...].
		header = make([]string, len(recs[0]))
		for i := range recs[0] {
			header[i] = stringz.GenerateAlphaColName(i, false)
		}
	}

	kinds, mungers, err := detectColKinds(recs)
	if err != nil {
		return err
	}

	// And now we need to create the dest table in scratchDB
	tblDef := createTblDef(source.MonotableName, header, kinds)

	err = scratchDB.SQLDriver().CreateTable(ctx, scratchDB.DB(), tblDef)
	if err != nil {
		return errz.Wrap(err, "csv: failed to create dest scratch table")
	}

	recMeta, err := getRecMeta(ctx, scratchDB, tblDef)
	if err != nil {
		return err
	}

	if OptEmptyAsNull.Get(src.Options) {
		configureEmptyNullMunge(mungers, recMeta)
	}

	insertWriter := libsq.NewDBWriter(
		scratchDB,
		tblDef.Name,
		driver.OptTuningRecChanSize.Get(scratchDB.Source().Options),
	)
	err = execInsert(ctx, insertWriter, recMeta, mungers, recs, cr)
	if err != nil {
		return err
	}

	inserted, err := insertWriter.Wait()
	if err != nil {
		return err
	}

	log.Debug("Inserted rows",
		lga.Count, inserted,
		lga.Target, source.Target(scratchDB.Source(), tblDef.Name),
	)
	return nil
}

// configureEmptyNullMunge updates mungers in place so that an empty
// string is munged to nil for every column whose kind, as reported by
// recMeta, is kind.Text. Columns of any other kind are left alone.
// Where a text column already has a munger, that munger still runs
// first and kind.MungeEmptyStringAsNil is applied to its result.
func configureEmptyNullMunge(mungers []kind.MungeFunc, recMeta sqlz.RecordMeta) {
	colKinds := recMeta.Kinds()

	for idx := range mungers {
		if colKinds[idx] != kind.Text {
			continue
		}

		// Take a per-iteration copy: the closure below must capture
		// this column's munger, not a shared loop variable.
		prior := mungers[idx]
		if prior == nil {
			mungers[idx] = kind.MungeEmptyStringAsNil
			continue
		}

		// Chain the pre-existing munger with the empty-as-nil munger.
		mungers[idx] = func(val any) (any, error) {
			out, err := prior(val)
			if err != nil {
				return out, err
			}

			return kind.MungeEmptyStringAsNil(out)
		}
	}
}

// Named delimiters. Each delimXKey constant is the user-facing name
// of a delimiter, and the delimX constant that follows it is the
// character that name stands for. The namedDelimiters map pairs each
// key with its character.
const (
	delimCommaKey  = "comma"
	delimComma     = ','
	delimSpaceKey  = "space"
	delimSpace     = ' '
	delimPipeKey   = "pipe"
	delimPipe      = '|'
	delimTabKey    = "tab"
	delimTab       = '\t'
	delimColonKey  = "colon"
	delimColon     = ':'
	delimSemiKey   = "semi"
	delimSemi      = ';'
	delimPeriodKey = "period"
	delimPeriod    = '.'
)

// NamedDelims returns the names of the supported named delimiters,
// in the order: comma, tab, semi, colon, space, pipe, period.
// A new slice is returned on each call.
func NamedDelims() []string {
	var names []string
	names = append(names,
		delimCommaKey,
		delimTabKey,
		delimSemiKey,
		delimColonKey,
		delimSpaceKey,
		delimPipeKey,
		delimPeriodKey,
	)
	return names
}

// namedDelimiters is a map of named delimiter strings to their
// rune values. For example, "comma" maps to ',' and "pipe" maps to '|'.
// getDelimFromOptions consults this map to translate a delimiter name
// into the rune to use.
var namedDelimiters = map[string]rune{
	delimCommaKey:  delimComma,
	delimSpaceKey:  delimSpace,
	delimPipeKey:   delimPipe,
	delimTabKey:    delimTab,
	delimColonKey:  delimColon,
	delimSemiKey:   delimSemi,
	delimPeriodKey: delimPeriod,
}

// getDelimiter resolves the delimiter rune to use for src.
// A delimiter obtained from src.Options via getDelimFromOptions
// takes precedence. Failing that, the default depends on the
// source type: tab for TypeTSV, comma for everything else.
// An error is returned only if getDelimFromOptions reports one.
func getDelimiter(src *source.Source) (rune, error) {
	explicit, found, err := getDelimFromOptions(src.Options)

	switch {
	case err != nil:
		return 0, err
	case found:
		return explicit, nil
	case src.Type == TypeTSV:
		return '\t', nil
	default:
		// Anything that isn't TSV defaults to comma.
		return ',', nil
	}
}

// getDelimFromOptions returns ok as true and the delimiter rune if a
// valid value is provided in opts, returns ok as false if opts is
// empty, and an error if the provided value is invalid.
//
// The option value is interpreted in one of two ways:
//   - A value consisting of exactly one character (one rune, which may
//     be encoded as several bytes) is used as the delimiter directly.
//   - Any other value must be one of the names in namedDelimiters,
//     such as "comma" or "tab".
//
// NOTE(review): if OptDelim.Get falls back to the option's default
// when opts is non-empty but does not set the delimiter, ok is
// reported as true with the comma delimiter, which would mask the tab
// default that getDelimiter applies for TSV sources. Confirm against
// the options package.
func getDelimFromOptions(opts options.Options) (r rune, ok bool, err error) {
	if len(opts) == 0 {
		return 0, false, nil
	}

	val := OptDelim.Get(opts)

	// Count runes, not bytes: a single multi-byte character is a
	// legitimate one-character delimiter.
	if utf8.RuneCountInString(val) == 1 {
		r, _ = utf8.DecodeRuneInString(val)
		if r == utf8.RuneError {
			// Either an invalid UTF-8 byte or a literal U+FFFD; neither
			// is usable as a CSV delimiter.
			return 0, false, errz.Errorf("invalid delimiter {%q}", val)
		}

		return r, true, nil
	}

	r, ok = namedDelimiters[val]
	if !ok {
		return 0, false, errz.Errorf("unknown delimiter constant {%s}", val)
	}

	return r, true, nil
}

// newCSVReader returns a csv.Reader that reads from r using delim as
// the field delimiter. The input is first passed through a
// crFilterReader, so that CSV files exported from Excel with bare
// '\r' end-of-line markers are still split into lines.
func newCSVReader(r io.Reader, delim rune) *csv.Reader {
	filtered := &crFilterReader{r: r}

	reader := csv.NewReader(filtered)
	reader.Comma = delim
	return reader
}

// crFilterReader is a reader whose Read method converts
// standalone carriage return '\r' bytes to newline '\n'.
// A CRLF "\r\n" sequence that arrives within a single read of the
// underlying reader is left untouched. If the underlying reader
// happens to split a CRLF across two reads, the pair is collapsed to
// a single '\n': the '\r' has already been handed out as '\n' by the
// time the '\n' is seen, so that '\n' is dropped rather than emitted
// as a second line break. For a consumer such as encoding/csv, which
// discards a '\r' that precedes '\n', the two forms are equivalent.
// This is useful for reading from DOS format, e.g. a TSV
// file exported by Microsoft Excel.
type crFilterReader struct {
	r io.Reader

	// prevCR is true when the last byte consumed from r was a '\r'
	// that ended a read, and was therefore emitted as '\n' without
	// knowing what follows it.
	prevCR bool
}

// Read implements io.Reader. It reads from the underlying reader into
// p, rewrites standalone '\r' bytes to '\n' in place, and returns the
// number of bytes made available in p along with any error reported
// by the underlying reader.
func (r *crFilterReader) Read(p []byte) (n int, err error) {
	for {
		n, err = r.r.Read(p)
		if n == 0 {
			// Nothing was consumed; pass the result straight through.
			return 0, err
		}

		if r.prevCR && p[0] == '\n' {
			// This '\n' completes a CRLF whose '\r' ended the previous
			// read and was already emitted as '\n'. Drop it so the pair
			// does not turn into two line breaks.
			n = copy(p, p[1:n])
		}
		r.prevCR = false

		for i := 0; i < n; i++ {
			if p[i] != '\r' {
				continue
			}

			if i+1 < n {
				if p[i+1] != '\n' {
					// it's just \r by itself, replace
					p[i] = '\n'
				}
				continue // otherwise it's \r\n
			}

			// The '\r' is the last byte of this read, so its successor
			// is not yet known. Emit it as '\n' now and remember it, so
			// that a '\n' opening the next read can be dropped.
			p[i] = '\n'
			r.prevCR = true
		}

		if n > 0 || err != nil {
			return n, err
		}

		// The only byte read was a dropped '\n'. Read again rather than
		// returning 0, nil to the caller.
	}
}

// readRecords reads a maximum of n records from cr.
//
// Reading stops early, without error, if cr reaches io.EOF; the
// records read up to that point are returned. Any other read error is
// returned (wrapped via errz.Err) along with a nil slice. If n is zero
// or negative, no records are read and an empty slice is returned.
func readRecords(cr *csv.Reader, n int) ([][]string, error) {
	if n <= 0 {
		// make panics on a negative capacity, and n originates from
		// user-supplied options.
		return [][]string{}, nil
	}

	recs := make([][]string, 0, n)

	for len(recs) < n {
		rec, err := cr.Read()
		if errors.Is(err, io.EOF) {
			return recs, nil
		}

		if err != nil {
			return nil, errz.Err(err)
		}

		recs = append(recs, rec)
	}

	return recs, nil
}
codebase refactor 2020-08-06 20:58:47 +03:00			`package csv`

			`import (`
			`"context"`
			`"encoding/csv"`
Errors linting (#117) * errors linting * errors linting * formatting 2022-12-18 07:31:06 +03:00			`"errors"`
codebase refactor 2020-08-06 20:58:47 +03:00			`"io"`
			`"unicode/utf8"`

#199: Config overhaul (#214) * refactor: partially moved over driver.Tuning params to options * All knobs moved to options * sq config edit: now has comments for options * Major work complete on config/options overhaul * Major work complete on config/options overhaul * Updated help text for 'sq version' 2023-05-03 15:36:10 +03:00			`"github.com/neilotoole/sq/drivers"`

CSV empty string now treated as NULL (#190) 2023-04-10 04:29:13 +03:00			`"github.com/neilotoole/sq/libsq/core/kind"`
			`"github.com/neilotoole/sq/libsq/core/sqlz"`

Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`"github.com/neilotoole/sq/libsq/core/stringz"`

Change logging library to slog (#175) - Switch to slog logger. 2023-04-02 22:49:45 +03:00			`"github.com/neilotoole/sq/libsq/core/lg/lga"`

			`"github.com/neilotoole/sq/libsq/core/lg/lgm"`

			`"github.com/neilotoole/sq/libsq/core/lg"`

codebase refactor 2020-08-06 20:58:47 +03:00			`"github.com/neilotoole/sq/libsq"`
Json driver; refactoring of core packages (#66) * Type Detector refactor * json driver impl; refactoring of source.Files reader et al * working on kind detector * significant switcheroo of packages * partway throught refactoring Kind * major package switcheroo for Kind 2020-08-23 13:42:15 +03:00			`"github.com/neilotoole/sq/libsq/core/errz"`
			`"github.com/neilotoole/sq/libsq/core/options"`
codebase refactor 2020-08-06 20:58:47 +03:00			`"github.com/neilotoole/sq/libsq/driver"`
			`"github.com/neilotoole/sq/libsq/source"`
			`)`

Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`// OptEmptyAsNull determines if an empty CSV field is treated as NULL`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`// or as the zero value for the kind of that field.`
Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`var OptEmptyAsNull = options.NewBool(`
			`"driver.csv.empty-as-null",`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`0,`
Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`true,`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`"Treat ingest empty CSV fields as NULL",`
			`When true, empty CSV fields are treated as NULL. When false,
#199: Config overhaul (#214) * refactor: partially moved over driver.Tuning params to options * All knobs moved to options * sq config edit: now has comments for options * Major work complete on config/options overhaul * Major work complete on config/options overhaul * Updated help text for 'sq version' 2023-05-03 15:36:10 +03:00			the zero value for that type is used, e.g. empty string or 0.`,
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`"source",`
			`"csv",`
Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`)`

			`// OptDelim specifies the CSV delimiter to use.`
			`var OptDelim = options.NewString(`
			`"driver.csv.delim",`
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`0,`
			`delimCommaKey,`
			`"Delimiter for ingest CSV data",`
#199: Config overhaul (#214) * refactor: partially moved over driver.Tuning params to options * All knobs moved to options * sq config edit: now has comments for options * Major work complete on config/options overhaul * Major work complete on config/options overhaul * Updated help text for 'sq version' 2023-05-03 15:36:10 +03:00			`Delimiter to use for CSV files. Default is "comma".
			Possible values are: comma, space, pipe, tab, colon, semi, period.`,
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`"source",`
			`"csv",`
Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`)`
codebase refactor 2020-08-06 20:58:47 +03:00
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`// ingestCSV loads the src CSV data into scratchDB.`
			`func ingestCSV(ctx context.Context, src *source.Source, openFn source.FileOpenFunc, scratchDB driver.Database) error {`
#199: Config overhaul (#214) * refactor: partially moved over driver.Tuning params to options * All knobs moved to options * sq config edit: now has comments for options * Major work complete on config/options overhaul * Major work complete on config/options overhaul * Updated help text for 'sq version' 2023-05-03 15:36:10 +03:00			`log := lg.FromContext(ctx)`
codebase refactor 2020-08-06 20:58:47 +03:00
Json driver; refactoring of core packages (#66) * Type Detector refactor * json driver impl; refactoring of source.Files reader et al * working on kind detector * significant switcheroo of packages * partway throught refactoring Kind * major package switcheroo for Kind 2020-08-23 13:42:15 +03:00			`var err error`
			`var r io.ReadCloser`

			`r, err = openFn()`
			`if err != nil {`
			`return err`
			`}`
Change logging library to slog (#175) - Switch to slog logger. 2023-04-02 22:49:45 +03:00			`defer lg.WarnIfCloseError(log, lgm.CloseFileReader, r)`
Fixed issues with files and databases not being closed correctly (#73) * fiddling with scratch database close order * files debugging * files debugging2 * files debugging3 * files debugging 4 * files debugging 5 * didn't close the ReadCloser in csv import * more closing cleanup 2020-12-30 21:57:58 +03:00
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`delim, err := getDelimiter(src)`
codebase refactor 2020-08-06 20:58:47 +03:00			`if err != nil {`
			`return err`
			`}`

Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`cr := newCSVReader(r, delim)`
#199: Config overhaul (#214) * refactor: partially moved over driver.Tuning params to options * All knobs moved to options * sq config edit: now has comments for options * Major work complete on config/options overhaul * Major work complete on config/options overhaul * Updated help text for 'sq version' 2023-05-03 15:36:10 +03:00			`recs, err := readRecords(cr, drivers.OptIngestSampleSize.Get(src.Options))`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`if err != nil {`
			`return err`
			`}`
codebase refactor 2020-08-06 20:58:47 +03:00
#217: Configurable timestamp format (#218) * Moved time functions from pkg stringz to pkg timez * Refactor options.Opt * wip: initial work on configurable time layout * wip: most printers now respect format.datetime and friends * Folded pkg timefmt into timez * Refactor options.Opt; refine options * Add 'sq config set OPTION --help' mechanism * Finished completion of OptDateFormatAsNumber and OptTimeFormatAsNumber 2023-05-07 05:36:34 +03:00			`headerPresent, err := hasHeaderRow(ctx, recs, src.Options)`
codebase refactor 2020-08-06 20:58:47 +03:00			`if err != nil {`
			`return err`
			`}`

Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`var header []string`
			`if headerPresent {`
			`header = recs[0]`
codebase refactor 2020-08-06 20:58:47 +03:00
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`// We're done with the first row`
			`recs = recs[1:]`
codebase refactor 2020-08-06 20:58:47 +03:00			`} else {`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`// The CSV file does not have a header record. We will generate`
			`// col names [A,B,C...].`
			`header = make([]string, len(recs[0]))`
			`for i := range recs[0] {`
			`header[i] = stringz.GenerateAlphaColName(i, false)`
codebase refactor 2020-08-06 20:58:47 +03:00			`}`
			`}`

CSV empty string now treated as NULL (#190) 2023-04-10 04:29:13 +03:00			`kinds, mungers, err := detectColKinds(recs)`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`if err != nil {`
			`return err`
			`}`

codebase refactor 2020-08-06 20:58:47 +03:00			`// And now we need to create the dest table in scratchDB`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`tblDef := createTblDef(source.MonotableName, header, kinds)`
codebase refactor 2020-08-06 20:58:47 +03:00
			`err = scratchDB.SQLDriver().CreateTable(ctx, scratchDB.DB(), tblDef)`
			`if err != nil {`
			`return errz.Wrap(err, "csv: failed to create dest scratch table")`
			`}`

			`recMeta, err := getRecMeta(ctx, scratchDB, tblDef)`
			`if err != nil {`
			`return err`
			`}`

Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`if OptEmptyAsNull.Get(src.Options) {`
CSV empty string now treated as NULL (#190) 2023-04-10 04:29:13 +03:00			`configureEmptyNullMunge(mungers, recMeta)`
			`}`

#199: Config overhaul (#214) * refactor: partially moved over driver.Tuning params to options * All knobs moved to options * sq config edit: now has comments for options * Major work complete on config/options overhaul * Major work complete on config/options overhaul * Updated help text for 'sq version' 2023-05-03 15:36:10 +03:00			`insertWriter := libsq.NewDBWriter(`
			`scratchDB,`
			`tblDef.Name,`
			`driver.OptTuningRecChanSize.Get(scratchDB.Source().Options),`
			`)`
CSV empty string now treated as NULL (#190) 2023-04-10 04:29:13 +03:00			`err = execInsert(ctx, insertWriter, recMeta, mungers, recs, cr)`
codebase refactor 2020-08-06 20:58:47 +03:00			`if err != nil {`
			`return err`
			`}`

			`inserted, err := insertWriter.Wait()`
			`if err != nil {`
			`return err`
			`}`

Change logging library to slog (#175) - Switch to slog logger. 2023-04-02 22:49:45 +03:00			`log.Debug("Inserted rows",`
			`lga.Count, inserted,`
			`lga.Target, source.Target(scratchDB.Source(), tblDef.Name),`
			`)`
codebase refactor 2020-08-06 20:58:47 +03:00			`return nil`
			`}`

CSV empty string now treated as NULL (#190) 2023-04-10 04:29:13 +03:00			`// configureEmptyNullMunge configures mungers to that empty string is`
			`// munged to nil.`
			`func configureEmptyNullMunge(mungers []kind.MungeFunc, recMeta sqlz.RecordMeta) {`
			`kinds := recMeta.Kinds()`
			`for i := range mungers {`
			`if kinds[i] == kind.Text {`
			`if mungers[i] == nil {`
			`mungers[i] = kind.MungeEmptyStringAsNil`
			`continue`
			`}`

			`// There's already a munger: wrap it`
			`existing := mungers[i]`
			`mungers[i] = func(v any) (any, error) {`
			`var err error`
			`v, err = existing(v)`
			`if err != nil {`
			`return v, err`
			`}`

			`return kind.MungeEmptyStringAsNil(v)`
			`}`
			`}`
			`}`
			`}`

Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`const (`
			`delimCommaKey = "comma"`
			`delimComma = ','`
			`delimSpaceKey = "space"`
			`delimSpace = ' '`
			`delimPipeKey = "pipe"`
			`delimPipe = '\|'`
			`delimTabKey = "tab"`
			`delimTab = '\t'`
			`delimColonKey = "colon"`
			`delimColon = ':'`
			`delimSemiKey = "semi"`
			`delimSemi = ';'`
			`delimPeriodKey = "period"`
			`delimPeriod = '.'`
			`)`

			`// NamedDelims returns the named delimiters, such as [comma, tab, pipe...].`
			`func NamedDelims() []string {`
			`return []string{`
			`delimCommaKey,`
			`delimTabKey,`
			`delimSemiKey,`
			`delimColonKey,`
			`delimSpaceKey,`
			`delimPipeKey,`
			`delimPeriodKey,`
			`}`
			`}`

codebase refactor 2020-08-06 20:58:47 +03:00			`// namedDelimiters is map of named delimiter strings to`
			`// rune value. For example, "comma" maps to ',' and "pipe" maps to '\|'.`
			`var namedDelimiters = map[string]rune{`
Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`delimCommaKey: delimComma,`
			`delimSpaceKey: delimSpace,`
			`delimPipeKey: delimPipe,`
			`delimTabKey: delimTab,`
			`delimColonKey: delimColon,`
			`delimSemiKey: delimSemi,`
			`delimPeriodKey: delimPeriod,`
codebase refactor 2020-08-06 20:58:47 +03:00			`}`

			`// getDelimiter returns the delimiter for src. An explicit`
			`// delimiter value may be set in src.Options; otherwise`
			`// the default for the source is returned.`
			`func getDelimiter(src *source.Source) (rune, error) {`
			`delim, ok, err := getDelimFromOptions(src.Options)`
			`if err != nil {`
			`return 0, err`
			`}`

			`if ok {`
			`return delim, nil`
			`}`

			`if src.Type == TypeTSV {`
			`return '\t', nil`
			`}`

			`// default is comma`
			`return ',', nil`
			`}`

			`// getDelimFromOptions returns ok as true and the delimiter rune if a`
			`// valid value is provided in src.Options, returns ok as false if`
			`// no valid value provided, and an error if the provided value is invalid.`
			`func getDelimFromOptions(opts options.Options) (r rune, ok bool, err error) {`
			`if len(opts) == 0 {`
			`return 0, false, nil`
			`}`

Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`val := OptDelim.Get(opts)`
codebase refactor 2020-08-06 20:58:47 +03:00
			`if len(val) == 1 {`
			`r, _ = utf8.DecodeRuneInString(val)`
			`return r, true, nil`
			`}`

			`r, ok = namedDelimiters[val]`

			`if !ok {`
Change logging library to slog (#175) - Switch to slog logger. 2023-04-02 22:49:45 +03:00			`err = errz.Errorf("unknown delimiter constant {%s}", val)`
codebase refactor 2020-08-06 20:58:47 +03:00			`return 0, false, err`
			`}`

			`return r, true, nil`
			`}`
Json driver; refactoring of core packages (#66) * Type Detector refactor * json driver impl; refactoring of source.Files reader et al * working on kind detector * significant switcheroo of packages * partway throught refactoring Kind * major package switcheroo for Kind 2020-08-23 13:42:15 +03:00
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00			`func newCSVReader(r io.Reader, delim rune) *csv.Reader {`
			`// We add the CR filter reader to deal with CSV files exported`
			`// from Excel which can have the DOS-style \r EOL markers.`
			`cr := csv.NewReader(&crFilterReader{r: r})`
			`cr.Comma = delim`
			`return cr`
			`}`

Json driver; refactoring of core packages (#66) * Type Detector refactor * json driver impl; refactoring of source.Files reader et al * working on kind detector * significant switcheroo of packages * partway throught refactoring Kind * major package switcheroo for Kind 2020-08-23 13:42:15 +03:00			`// crFilterReader is a reader whose Read method converts`
			`// standalone carriage return '\r' bytes to newline '\n'.`
			`// CRLF "\r\n" sequences are untouched.`
			`// This is useful for reading from DOS format, e.g. a TSV`
			`// file exported by Microsoft Excel.`
			`type crFilterReader struct {`
			`r io.Reader`
			`}`

			`func (r *crFilterReader) Read(p []byte) (n int, err error) {`
			`n, err = r.r.Read(p)`

			`for i := 0; i < n; i++ {`
			`if p[i] == 13 {`
			`if i+1 < n && p[i+1] == 10 {`
			`continue // it's \r\n`
			`}`
			`// it's just \r by itself, replace`
			`p[i] = 10`
			`}`
			`}`

			`return n, err`
			`}`
Auto-detect CSV headers (#188) * wip: initial header detection * wip: refactored csv driver * wip: Fixes to kind.Detector * typo * Refactor pkg csv files * Update to changelog * workflow: now running codacy and codeql on cron, not on push/pr 2023-04-09 17:44:27 +03:00
			`// readRecords reads a maximum of n records from cr.`
			`func readRecords(cr *csv.Reader, n int) ([][]string, error) {`
			`recs := make([][]string, 0, n)`

			`for i := 0; i < n; i++ {`
			`rec, err := cr.Read()`
			`if err != nil {`
			`if errors.Is(err, io.EOF) {`
			`return recs, nil`
			`}`

			`return nil, errz.Err(err)`
			`}`
			`recs = append(recs, rec)`
			`}`

			`return recs, nil`
			`}`