fixed some panics with oddly-shaped xlsx imports (#71)

This commit is contained in:
Neil O'Toole 2020-11-02 10:40:29 -07:00 committed by GitHub
parent 93f0de8614
commit 06900f8c84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 315 additions and 83 deletions

View File

@ -1,14 +1,14 @@
// Package buildinfo hosts build info variables populated via ldflags.
package buildinfo
// DefaultVersion is the default value for Version if not
// defaultVersion is the default value for Version if not
// set via ldflags.
const DefaultVersion = "v0.0.0-dev"
const defaultVersion = "v0.0.0-dev"
var (
// Version is the build version. If not set at build time via
// ldflags, Version takes the value of DefaultVersion.
Version = DefaultVersion
// ldflags, Version takes the value of defaultVersion.
Version = defaultVersion
// Commit is the commit hash.
Commit string

View File

@ -9,6 +9,7 @@ import (
"github.com/neilotoole/sq/drivers/csv"
"github.com/neilotoole/sq/drivers/sqlite3"
"github.com/neilotoole/sq/drivers/xlsx"
"github.com/neilotoole/sq/libsq/source"
"github.com/neilotoole/sq/testh"
"github.com/neilotoole/sq/testh/proj"
@ -16,6 +17,62 @@ import (
)
func TestCmdInspect(t *testing.T) {
testCases := []struct {
handle string
wantErr bool
wantType source.Type
wantTbls []string
}{
{
handle: sakila.CSVActor,
wantType: csv.TypeCSV,
wantTbls: []string{source.MonotableName},
},
{
handle: sakila.TSVActor,
wantType: csv.TypeTSV,
wantTbls: []string{source.MonotableName},
},
{
handle: sakila.XLSX,
wantType: xlsx.Type,
wantTbls: sakila.AllTbls(),
},
{
handle: sakila.SL3,
wantType: sqlite3.Type,
wantTbls: sakila.AllTblsViews(),
},
}
for _, tc := range testCases {
tc := tc
t.Run(tc.handle, func(t *testing.T) {
t.Parallel()
th := testh.New(t)
src := th.Source(tc.handle)
ru := newRun(t).add(*src)
err := ru.exec("inspect", "--json")
if tc.wantErr {
require.Error(t, err)
return
}
md := &source.Metadata{}
require.NoError(t, json.Unmarshal(ru.out.Bytes(), md))
require.Equal(t, tc.wantType, md.SourceType)
require.Equal(t, src.Handle, md.Handle)
require.Equal(t, src.Location, md.Location)
require.Equal(t, tc.wantTbls, md.TableNames())
})
}
}
func TestCmdInspectSmoke(t *testing.T) {
th := testh.New(t)
src := th.Source(sakila.SL3)

View File

@ -91,7 +91,7 @@ func execSLQ(rc *RunContext, cmd *cobra.Command, args []string) error {
return execSLQInsert(rc, destSrc, destTbl)
}
// execSQLInsert executes the SQL and inserts resulting records
// execSQLInsert executes the SLQ and inserts resulting records
// into destTbl in destSrc.
func execSLQInsert(rc *RunContext, destSrc *source.Source, destTbl string) error {
args, srcs, dbases := rc.Args, rc.Config.Sources, rc.databases
@ -114,20 +114,21 @@ func execSLQInsert(rc *RunContext, destSrc *source.Source, destTbl string) error
// stack.
inserter := libsq.NewDBWriter(rc.Log, destDB, destTbl, driver.Tuning.RecordChSize)
err = libsq.ExecuteSLQ(ctx, rc.Log, rc.databases, rc.databases, srcs, slq, inserter)
if err != nil {
return errz.Wrapf(err, "insert %s.%s failed", destSrc.Handle, destTbl)
execErr := libsq.ExecuteSLQ(ctx, rc.Log, rc.databases, rc.databases, srcs, slq, inserter)
affected, waitErr := inserter.Wait() // Wait for the writer to finish processing
if execErr != nil {
return errz.Wrapf(execErr, "insert %s.%s failed", destSrc.Handle, destTbl)
}
affected, err := inserter.Wait() // Wait for the writer to finish processing
if err != nil {
return errz.Wrapf(err, "insert %s.%s failed", destSrc.Handle, destTbl)
if waitErr != nil {
return errz.Wrapf(waitErr, "insert %s.%s failed", destSrc.Handle, destTbl)
}
fmt.Fprintf(rc.Out, stringz.Plu("Inserted %d row(s) into %s.%s\n", int(affected)), affected, destSrc.Handle, destTbl)
return nil
}
// execSLQPrint executes the SLQ query, and prints output to writer.
func execSLQPrint(rc *RunContext) error {
slq, err := preprocessUserSLQ(rc, rc.Args)
if err != nil {
@ -135,17 +136,13 @@ func execSLQPrint(rc *RunContext) error {
}
recw := output.NewRecordWriterAdapter(rc.writers.recordw)
err = libsq.ExecuteSLQ(rc.Context, rc.Log, rc.databases, rc.databases, rc.Config.Sources, slq, recw)
if err != nil {
return err
execErr := libsq.ExecuteSLQ(rc.Context, rc.Log, rc.databases, rc.databases, rc.Config.Sources, slq, recw)
_, waitErr := recw.Wait()
if execErr != nil {
return execErr
}
_, err = recw.Wait()
if err != nil {
return err
}
return err
return waitErr
}
// preprocessUserSLQ does a bit of validation and munging on the

View File

@ -113,11 +113,13 @@ func TestCmdSQL_StdinQuery(t *testing.T) {
testCases := []struct {
fpath string
tbl string
srcHeader bool
wantCount int
wantErr bool
}{
{fpath: proj.Abs(sakila.PathCSVActorNoHeader), tbl: source.MonotableName, wantCount: sakila.TblActorCount},
{fpath: proj.Abs(sakila.PathXLSXSubset), tbl: sakila.TblActor, wantCount: sakila.TblActorCount + 1}, // +1 is for the header row in the XLSX file
{fpath: proj.Abs(sakila.PathXLSXActorHeader), srcHeader: true, tbl: sakila.TblActor, wantCount: sakila.TblActorCount},
{fpath: proj.Abs(sakila.PathXLSXSubset), srcHeader: true, tbl: sakila.TblActor, wantCount: sakila.TblActorCount},
{fpath: proj.Abs("README.md"), wantErr: true},
}
@ -133,7 +135,12 @@ func TestCmdSQL_StdinQuery(t *testing.T) {
ru := newRun(t).hush()
ru.rc.Stdin = f
err = ru.exec("sql", "--header=false", "SELECT * FROM "+tc.tbl)
args := []string{"sql", "--header=false", "SELECT * FROM " + tc.tbl}
if tc.srcHeader {
args = append(args, "--opts=header=true")
}
err = ru.exec(args...)
if tc.wantErr {
require.Error(t, err)
return

View File

@ -155,15 +155,19 @@ func (p *processor) buildSchemaFlat() (*importSchema, error) {
for _, field := range e.fieldNames {
if detector, ok := e.detectors[field]; ok {
// If it has a detector, it's a regular field
kind, mungeFn, err := detector.Detect()
k, mungeFn, err := detector.Detect()
if err != nil {
return errz.Err(err)
}
if k == kind.Null {
k = kind.Text
}
colDef := &sqlmodel.ColDef{
Name: p.calcColName(e, field),
Table: tblDef,
Kind: kind,
Kind: k,
}
colDefs = append(colDefs, colDef)

View File

@ -299,6 +299,9 @@ func detectColKindsJSONA(ctx context.Context, r io.Reader) ([]kind.Kind, []kind.
if err != nil {
return nil, nil, err
}
if kinds[i] == kind.Null {
kinds[i] = kind.Text
}
}
return kinds, mungeFns, nil

View File

@ -84,6 +84,10 @@ func importSheetToTable(ctx context.Context, log lg.Log, sheet *xlsx.Sheet, hasH
continue
}
if isEmptyRow(row) {
continue
}
rec := rowToRecord(log, destColKinds, row, sheet.Name, i)
err = bi.Munge(rec)
if err != nil {
@ -120,6 +124,22 @@ func importSheetToTable(ctx context.Context, log lg.Log, sheet *xlsx.Sheet, hasH
return nil
}
// isEmptyRow reports whether row should be treated as empty:
// a nil row, a row with no cells, or a row in which every
// cell's value is the empty string.
func isEmptyRow(row *xlsx.Row) bool {
	if row == nil {
		return true
	}
	for _, cell := range row.Cells {
		if cell.Value != "" {
			return false
		}
	}
	// Zero cells, or all cells empty.
	return true
}
// buildTblDefsForSheets returns a TableDef for each sheet.
func buildTblDefsForSheets(ctx context.Context, log lg.Log, sheets []*xlsx.Sheet, hasHeader bool) ([]*sqlmodel.TableDef, error) {
tblDefs := make([]*sqlmodel.TableDef, len(sheets))
@ -148,22 +168,21 @@ func buildTblDefsForSheets(ctx context.Context, log lg.Log, sheets []*xlsx.Sheet
// buildTblDefForSheet creates a table for the given sheet, and returns
// a model of the table, or an error.
func buildTblDefForSheet(log lg.Log, sheet *xlsx.Sheet, hasHeader bool) (*sqlmodel.TableDef, error) {
numCells := getRowsMaxCellCount(sheet)
if numCells == 0 {
maxCols := getRowsMaxCellCount(sheet)
if maxCols == 0 {
return nil, errz.Errorf("sheet %q has no columns", sheet.Name)
}
colNames := make([]string, numCells)
colKinds := make([]kind.Kind, numCells)
colNames := make([]string, maxCols)
colKinds := make([]kind.Kind, maxCols)
firstDataRow := 0
if len(sheet.Rows) == 0 {
// TODO: is this even reachable? That is, if sheet.Rows is empty,
// then sheet.cols (checked for above) will also be empty?
// sheet has no rows
for i := 0; i < numCells; i++ {
for i := 0; i < maxCols; i++ {
colKinds[i] = kind.Text
colNames[i] = stringz.GenerateAlphaColName(i, false)
}
@ -174,11 +193,11 @@ func buildTblDefForSheet(log lg.Log, sheet *xlsx.Sheet, hasHeader bool) (*sqlmod
if hasHeader {
firstDataRow = 1
headerCells := sheet.Rows[0].Cells
for i := 0; i < numCells; i++ {
for i := 0; i < len(headerCells); i++ {
colNames[i] = headerCells[i].Value
}
} else {
for i := 0; i < numCells; i++ {
for i := 0; i < maxCols; i++ {
colNames[i] = stringz.GenerateAlphaColName(i, false)
}
}
@ -186,16 +205,22 @@ func buildTblDefForSheet(log lg.Log, sheet *xlsx.Sheet, hasHeader bool) (*sqlmod
// Set up the column types
if firstDataRow >= len(sheet.Rows) {
// the sheet contains only one row (the header row). Let's
// explicitly set the column type none the less
for i := 0; i < numCells; i++ {
// explicitly set the column type nonetheless
for i := 0; i < maxCols; i++ {
colKinds[i] = kind.Text
}
} else {
// we have at least one data row, let's get the column types
colKinds = getKindsFromCells(sheet.Rows[firstDataRow].Cells)
var err error
colKinds, err = calcKindsForRows(firstDataRow, sheet.Rows)
if err != nil {
return nil, err
}
}
}
colNames, colKinds = syncColNamesKinds(colNames, colKinds)
tblDef := &sqlmodel.TableDef{Name: sheet.Name}
cols := make([]*sqlmodel.ColDef, len(colNames))
for i, colName := range colNames {
@ -207,9 +232,65 @@ func buildTblDefForSheet(log lg.Log, sheet *xlsx.Sheet, hasHeader bool) (*sqlmod
return tblDef, nil
}
// syncColNamesKinds ensures that column names and kinds are in
// a working state vis-a-vis each other. Notably if a colName is
// empty and its equivalent kind is kind.Null, that element
// is filtered out.
func syncColNamesKinds(colNames []string, colKinds []kind.Kind) (names []string, kinds []kind.Kind) {
	// Allow for "phantom" columns (columns with entirely empty
	// data): trim any names that have no corresponding kind.
	// Note: not sure if this scenario is now reachable.
	if len(colNames) > len(colKinds) {
		colNames = colNames[:len(colKinds)]
	}

	// Drop any column whose name is empty and whose kind is
	// indeterminate (Null or Unknown).
	for i, name := range colNames {
		k := colKinds[i]
		if name == "" && (k == kind.Null || k == kind.Unknown) {
			continue
		}
		names = append(names, name)
		kinds = append(kinds, k)
	}

	// Any remaining unnamed column (empty header cell) gets a
	// generated name. Possibly we should just return an error,
	// but instead we try to synthesize a unique name.
	for i, name := range names {
		if name != "" {
			continue
		}
		generated := stringz.GenerateAlphaColName(i, false)
		for stringz.InSlice(names[:i], generated) {
			// Name collision: append an underscore and retry.
			generated += "_"
		}
		names[i] = generated
	}

	// Coerce any indeterminate kinds to Text.
	for i, k := range kinds {
		if k == kind.Null || k == kind.Unknown {
			kinds[i] = kind.Text
		}
	}

	return names, kinds
}
func rowToRecord(log lg.Log, destColKinds []kind.Kind, row *xlsx.Row, sheetName string, rowIndex int) []interface{} {
vals := make([]interface{}, len(row.Cells))
vals := make([]interface{}, len(destColKinds))
for j, cell := range row.Cells {
if j >= len(vals) {
log.Warnf("sheet %s[%d:%d]: skipping additional cells because there's more cells than expected (%d)",
sheetName, rowIndex, j, len(destColKinds))
continue
}
typ := cell.Type()
switch typ {
case xlsx.CellTypeBool:
@ -271,42 +352,100 @@ func rowToRecord(log lg.Log, destColKinds []kind.Kind, row *xlsx.Row, sheetName
return vals
}
func getKindsFromCells(cells []*xlsx.Cell) []kind.Kind {
vals := make([]kind.Kind, len(cells))
for i, cell := range cells {
typ := cell.Type()
switch typ {
case xlsx.CellTypeBool:
vals[i] = kind.Bool
case xlsx.CellTypeNumeric:
if cell.IsTime() {
vals[i] = kind.Datetime
continue
}
// readCellValue reads the value of a cell, returning a value of
// type that most matches the sq kind.
func readCellValue(cell *xlsx.Cell) interface{} {
if cell == nil || cell.Value == "" {
return nil
}
_, err := cell.Int64()
var val interface{}
switch cell.Type() {
case xlsx.CellTypeBool:
val = cell.Bool()
return val
case xlsx.CellTypeNumeric:
if cell.IsTime() {
t, err := cell.GetTime(false)
if err == nil {
vals[i] = kind.Int
continue
return t
}
_, err = cell.Float()
t, err = cell.GetTime(true)
if err == nil {
vals[i] = kind.Float
continue
return t
}
// it's not an int, it's not a float
vals[i] = kind.Decimal
case xlsx.CellTypeDate:
// TODO: support time values here?
vals[i] = kind.Datetime
// Otherwise we have an error, just return the value
val, _ = cell.FormattedValue()
return val
}
default:
vals[i] = kind.Text
intVal, err := cell.Int64()
if err == nil {
val = intVal
return val
}
floatVal, err := cell.Float()
if err == nil {
val = floatVal
return val
}
val, _ = cell.FormattedValue()
return val
case xlsx.CellTypeString:
val = cell.String()
case xlsx.CellTypeDate:
// TODO: parse into a time.Time value here?
val, _ = cell.FormattedValue()
default:
val, _ = cell.FormattedValue()
}
return val
}
// calcKindsForRows calculates the lowest-common-denominator kind
// for the cells of rows, sampling every non-empty row from
// firstDataRow onwards. The returned slice will have length
// equal to the longest row.
func calcKindsForRows(firstDataRow int, rows []*xlsx.Row) ([]kind.Kind, error) {
	if firstDataRow > len(rows) {
		return nil, errz.Errorf("rows are empty")
	}

	var detectors []*kind.Detector

	for i := firstDataRow; i < len(rows); i++ {
		// Empty rows contribute no samples; skip them so they
		// don't skew kind detection.
		if isEmptyRow(rows[i]) {
			continue
		}

		// Grow the detector slice to match the widest row seen so far.
		for j := len(detectors); j < len(rows[i].Cells); j++ {
			detectors = append(detectors, kind.NewDetector())
		}

		for j := range rows[i].Cells {
			val := readCellValue(rows[i].Cells[j])
			detectors[j].Sample(val)
		}
	}

	kinds := make([]kind.Kind, len(detectors))

	for j := range detectors {
		knd, _, err := detectors[j].Detect()
		if err != nil {
			return nil, err
		}
		kinds[j] = knd
	}

	return kinds, nil
}
// getColNames returns column names for the sheet. If hasHeader is true and there's
@ -318,22 +457,23 @@ func getColNames(sheet *xlsx.Sheet, hasHeader bool) []string {
if len(sheet.Rows) > 0 && hasHeader {
row := sheet.Rows[0]
for i := 0; i < numCells; i++ {
colNames[i] = row.Cells[i].Value
for i := 0; i < len(row.Cells); i++ {
colNames[i] = row.Cells[i].String()
}
return colNames
}
for i := 0; i < numCells; i++ {
colNames[i] = stringz.GenerateAlphaColName(i, false)
for i := range colNames {
if colNames[i] == "" {
colNames[i] = stringz.GenerateAlphaColName(i, false)
}
}
return colNames
}
// getColTypes returns the xlsx column types for the sheet, determined from
// getCellColumnTypes returns the xlsx cell types for the sheet, determined from
// the values of the first data row (after any header row).
func getColTypes(sheet *xlsx.Sheet, hasHeader bool) []xlsx.CellType {
func getCellColumnTypes(sheet *xlsx.Sheet, hasHeader bool) []xlsx.CellType {
types := make([]*xlsx.CellType, getRowsMaxCellCount(sheet))
firstDataRow := 0
if hasHeader {

Binary file not shown.

View File

@ -190,6 +190,11 @@ func (d *database) TableMetadata(ctx context.Context, tblName string) (*source.T
}
// SourceMetadata implements driver.Database.
// TODO: the implementation of SourceMetadata is out
// of sync with the way we import data. For example, empty
// rows are filtered out during import, and empty columns
// are discarded. Thus SourceMetadata needs an overhaul to
// bring its reporting into line with import.
func (d *database) SourceMetadata(ctx context.Context) (*source.Metadata, error) {
meta := &source.Metadata{Handle: d.src.Handle}
@ -231,7 +236,9 @@ func (d *database) SourceMetadata(ctx context.Context) (*source.Metadata, error)
}
colNames := getColNames(sheet, hasHeader)
colTypes := getColTypes(sheet, hasHeader)
// TODO: Should move over to using kind.Detector
colTypes := getCellColumnTypes(sheet, hasHeader)
for i, colType := range colTypes {
col := &source.ColMetadata{}

View File

@ -198,6 +198,7 @@ func (d *Detector) Sample(v interface{}) {
return
case time.Time:
d.retain(Time, Date, Datetime)
return
case stdj.Number:
// JSON number
d.foundString = true
@ -274,7 +275,7 @@ func (d *Detector) Sample(v interface{}) {
return nil, nil
}
s, ok := val.(string)
s, ok = val.(string)
if !ok {
return nil, errz.Errorf("expected %T to be string", val)
}
@ -283,7 +284,8 @@ func (d *Detector) Sample(v interface{}) {
return nil, nil
}
t, err := time.Parse(format, s)
var t time.Time
t, err = time.Parse(format, s)
if err != nil {
return nil, errz.Err(err)
}
@ -307,7 +309,7 @@ func (d *Detector) Sample(v interface{}) {
return nil, nil
}
s, ok := val.(string)
s, ok = val.(string)
if !ok {
return nil, errz.Errorf("expected %T to be string", val)
}
@ -316,7 +318,8 @@ func (d *Detector) Sample(v interface{}) {
return nil, nil
}
t, err := time.Parse(format, s)
var t time.Time
t, err = time.Parse(format, s)
if err != nil {
return nil, errz.Err(err)
}
@ -362,7 +365,8 @@ func (d *Detector) Sample(v interface{}) {
}
}
// Detect returns the detected Kind. If ambiguous, Text is returned.
// Detect returns the detected Kind. If ambiguous, Text is returned,
// unless all sampled values were nil, in which case Null is returned.
// If the returned mungeFn is non-nil, it can be used to convert input
// values to their canonical form. For example, for Datetime the MungeFunc
// would accept string "2020-06-11T02:50:54Z" and return a time.Time,
@ -370,8 +374,12 @@ func (d *Detector) Sample(v interface{}) {
// and always return a string in the canonicalized form "1970-01-01".
func (d *Detector) Detect() (kind Kind, mungeFn MungeFunc, err error) {
if !d.dirty {
if d.foundString {
return Text, nil, nil
}
// If we haven't filtered any kinds, default to Text.
return Text, nil, nil
return Null, nil, nil
}
switch len(d.kinds) {
@ -381,7 +389,15 @@ func (d *Detector) Detect() (kind Kind, mungeFn MungeFunc, err error) {
for k := range d.kinds {
return k, d.mungeFns[k], nil
}
default:
}
// NOTE: this logic below about detecting the remaining type
// is a bit sketchy. If you're debugging this code, it's
// probably the case that the code below is faulty.
if d.has(Time, Date, Datetime) {
// If all three time types are left, use the most
// general, i.e. Datetime.
return Datetime, d.mungeFns[Datetime], nil
}
if d.has(Time) {

View File

@ -88,10 +88,10 @@ func TestKindDetector(t *testing.T) {
wantMunge bool
wantErr bool
}{
{in: nil, want: kind.Text},
{in: []interface{}{}, want: kind.Text},
{in: nil, want: kind.Null},
{in: []interface{}{}, want: kind.Null},
{in: []interface{}{""}, want: kind.Text},
{in: []interface{}{nil}, want: kind.Text},
{in: []interface{}{nil}, want: kind.Null},
{in: []interface{}{nil, ""}, want: kind.Text},
{in: []interface{}{int(1), int8(8), int16(16), int32(32), int64(64)}, want: kind.Int},
{in: []interface{}{1, "2", "3"}, want: kind.Decimal},
@ -100,7 +100,7 @@ func TestKindDetector(t *testing.T) {
{in: []interface{}{1, "2", stdj.Number("1000")}, want: kind.Decimal},
{in: []interface{}{1.0, "2.0"}, want: kind.Decimal},
{in: []interface{}{1, float64(2.0), float32(7.7), int32(3)}, want: kind.Float},
{in: []interface{}{nil, nil, nil}, want: kind.Text},
{in: []interface{}{nil, nil, nil}, want: kind.Null},
{in: []interface{}{"1.0", "2.0", "3.0", "4", nil, int64(6)}, want: kind.Decimal},
{in: []interface{}{true, false, nil, "true", "false", "yes", "no", ""}, want: kind.Bool},
{in: []interface{}{"0", "1"}, want: kind.Decimal},
@ -145,7 +145,7 @@ func TestKindDetector(t *testing.T) {
return
}
require.Equal(t, tc.want.String(), gotKind.String())
require.Equal(t, tc.want.String(), gotKind.String(), tc.in)
if !tc.wantMunge {
require.Nil(t, gotMungeFn)

View File

@ -29,7 +29,7 @@ const (
MS = MS17
)
// AllHandles returns all the sakila handles. It does not
// AllHandles returns all the typical sakila handles. It does not
// include monotable handles such as @sakila_csv_actor.
func AllHandles() []string {
return []string{SL3, Pg9, Pg10, Pg11, Pg12, My56, My57, My8, MS17, XLSX}
@ -145,6 +145,7 @@ const (
PathSL3 = "drivers/sqlite3/testdata/sakila.db"
PathXLSX = "drivers/xlsx/testdata/sakila.xlsx"
PathXLSXSubset = "drivers/xlsx/testdata/sakila_subset.xlsx"
PathXLSXActorHeader = "drivers/xlsx/testdata/sakila_actor_header.xlsx"
PathCSVActor = "drivers/csv/testdata/sakila-csv/actor.csv"
PathCSVActorNoHeader = "drivers/csv/testdata/sakila-csv-noheader/actor.csv"
PathTSVActor = "drivers/csv/testdata/sakila-tsv/actor.tsv"