package xlsx import ( "context" "database/sql" "fmt" "slices" "strings" "time" "github.com/samber/lo" "github.com/neilotoole/sq/libsq/core/loz" "github.com/xuri/excelize/v2" "github.com/neilotoole/sq/libsq/core/options" "github.com/neilotoole/sq/libsq/core/lg/lga" "github.com/neilotoole/sq/libsq/core/lg/lgm" "github.com/neilotoole/sq/libsq/core/lg" "golang.org/x/sync/errgroup" "github.com/neilotoole/sq/libsq/core/errz" "github.com/neilotoole/sq/libsq/core/kind" "github.com/neilotoole/sq/libsq/source" "github.com/neilotoole/sq/libsq/core/sqlmodel" "github.com/neilotoole/sq/libsq/core/stringz" "github.com/neilotoole/sq/libsq/driver" ) const msgCloseRowIter = "Close Excel row iterator" func hasSheet(xfile *excelize.File, sheetName string) bool { return slices.Contains(xfile.GetSheetList(), sheetName) } // sheetTable maps a sheet to a database table. type sheetTable struct { sheet *xSheet def *sqlmodel.TableDef colIngestMungeFns []kind.MungeFunc hasHeaderRow bool } // xSheet encapsulates access to a worksheet. type xSheet struct { file *excelize.File name string sampleRows [][]string // sampleRowsMaxWidth is the width of the widest row in sampleRows. sampleRowsMaxWidth int } // loadSampleRows loads up to sampleSize rows, storing them to xSheet.sampleRows. // Note that the row count may be less than sampleSize, if there aren't // that many rows, or some rows are empty. func (xs *xSheet) loadSampleRows(ctx context.Context, sampleSize int) error { iter, err := xs.file.Rows(xs.name) if err != nil { return err } defer lg.WarnIfCloseError(lg.FromContext(ctx), msgCloseRowIter, iter) var count int for iter.Next() { if count >= sampleSize { break } var cells []string if cells, err = iter.Columns(); err != nil { return err } if !loz.IsSliceZeroed(cells) { xs.sampleRows = append(xs.sampleRows, cells) if len(cells) > xs.sampleRowsMaxWidth { xs.sampleRowsMaxWidth = len(cells) } } count++ } loz.AlignMatrixWidth(xs.sampleRows, "") return nil } // ingestXLSX loads the data in xfile into scratchPool. // If includeSheetNames is non-empty, only the named sheets are ingested. func ingestXLSX(ctx context.Context, src *source.Source, scratchPool driver.Pool, xfile *excelize.File, includeSheetNames []string, ) error { log := lg.FromContext(ctx) start := time.Now() log.Debug("Beginning import from XLSX", lga.Src, src, lga.Target, scratchPool.Source()) var sheets []*xSheet if len(includeSheetNames) > 0 { for _, sheetName := range includeSheetNames { if !hasSheet(xfile, sheetName) { return errz.Errorf("sheet {%s} not found", sheetName) } sheets = append(sheets, &xSheet{file: xfile, name: sheetName}) } } else { sheetNames := xfile.GetSheetList() sheets = make([]*xSheet, len(sheetNames)) for i := range sheetNames { sheets[i] = &xSheet{file: xfile, name: sheetNames[i]} } } srcIngestHeader := getSrcIngestHeader(src.Options) sheetTbls, err := buildSheetTables(ctx, srcIngestHeader, sheets) if err != nil { return err } for _, sheetTbl := range sheetTbls { if sheetTbl == nil { // tblDef can be nil if its sheet is empty (has no data). continue } var db *sql.DB if db, err = scratchPool.DB(ctx); err != nil { return err } if err = scratchPool.SQLDriver().CreateTable(ctx, db, sheetTbl.def); err != nil { return err } } log.Debug("Tables created (but not yet populated)", lga.Count, len(sheetTbls), lga.Target, scratchPool.Source(), lga.Elapsed, time.Since(start)) var imported, skipped int for i := range sheetTbls { if sheetTbls[i] == nil { // tblDef can be nil if its sheet is empty (has no data). skipped++ continue } if err = ingestSheetToTable(ctx, scratchPool, sheetTbls[i]); err != nil { return err } imported++ } log.Debug("Sheets imported", lga.Count, imported, "skipped", skipped, lga.From, src, lga.To, scratchPool.Source(), lga.Elapsed, time.Since(start), ) return nil } // ingestSheetToTable imports the sheet data into the appropriate table // in scratchPool. The scratch table must already exist. func ingestSheetToTable(ctx context.Context, scratchPool driver.Pool, sheetTbl *sheetTable) error { var ( log = lg.FromContext(ctx) startTime = time.Now() sheet = sheetTbl.sheet hasHeader = sheetTbl.hasHeaderRow tblDef = sheetTbl.def destColKinds = tblDef.ColKinds() ) db, err := scratchPool.DB(ctx) if err != nil { return err } conn, err := db.Conn(ctx) if err != nil { return errz.Err(err) } defer lg.WarnIfCloseError(log, lgm.CloseDB, conn) drvr := scratchPool.SQLDriver() batchSize := driver.MaxBatchRows(drvr, len(destColKinds)) bi, err := driver.NewBatchInsert(ctx, drvr, conn, tblDef.Name, tblDef.ColNames(), batchSize) if err != nil { return err } iter, err := sheetTbl.sheet.file.Rows(sheetTbl.sheet.name) if err != nil { return errw(err) } defer lg.WarnIfCloseError(log, msgCloseRowIter, iter) var cells []string i := -1 for iter.Next() { i++ if hasHeader && i == 0 { continue } if cells, err = iter.Columns(); err != nil { close(bi.RecordCh) return err } if loz.IsSliceZeroed(cells) { // Skip empty row continue } rec := rowToRecord(ctx, destColKinds, sheetTbl.colIngestMungeFns, sheet.name, i, cells) if err = bi.Munge(rec); err != nil { close(bi.RecordCh) return err } select { case <-ctx.Done(): close(bi.RecordCh) return ctx.Err() case err = <-bi.ErrCh: if err != nil { close(bi.RecordCh) return err } // The batch inserter successfully completed break case bi.RecordCh <- rec: } } close(bi.RecordCh) // Indicate that we're finished writing records err = <-bi.ErrCh // Wait for bi to complete if err != nil { return err } if err = iter.Error(); err != nil { return errz.Wrap(err, "excel: sheet iterator") } log.Debug("Inserted rows from sheet into table", lga.Count, bi.Written(), laSheet, sheet.name, lga.Target, source.Target(scratchPool.Source(), tblDef.Name), lga.Elapsed, time.Since(startTime)) return nil } // buildSheetTables executes buildSheetTable for each sheet. If sheet is // empty (has no data), the sheetTable element for that sheet will be nil. func buildSheetTables(ctx context.Context, srcIngestHeader *bool, sheets []*xSheet) ([]*sheetTable, error) { sheetTbls := make([]*sheetTable, len(sheets)) g, gCtx := errgroup.WithContext(ctx) for i := range sheets { i := i g.Go(func() error { select { case <-gCtx.Done(): return gCtx.Err() default: } sheetTbl, err := buildSheetTable(gCtx, srcIngestHeader, sheets[i]) if err != nil { if errz.IsErrNoData(err) { // If the sheet has no data, we log it and skip it. lg.FromContext(ctx).Warn("Excel sheet has no data", laSheet, sheets[i].name, lga.Err, err) return nil } return err } sheetTbls[i] = sheetTbl return nil }) } if err := g.Wait(); err != nil { return nil, err } // Remove any nil sheets (which can happen if the sheet is empty). sheetTbls = lo.Compact(sheetTbls) return sheetTbls, nil } // getSrcIngestHeader returns nil if driver.OptIngestHeader is not set, // and has the value of the opt if set. func getSrcIngestHeader(o options.Options) *bool { if driver.OptIngestHeader.IsSet(o) { b := driver.OptIngestHeader.Get(o) return &b } return nil } // buildSheetTable constructs a table definition for the given sheet, and returns // a model of the table, or an error. If the sheet is empty, (nil,nil) // is returned. If srcIngestHeader is nil, the function attempts // to detect if the sheet has a header row. // If the sheet has no data, errz.NoDataError is returned. func buildSheetTable(ctx context.Context, srcIngestHeader *bool, sheet *xSheet) (*sheetTable, error) { log := lg.FromContext(ctx) sampleSize := driver.OptIngestSampleSize.Get(options.FromContext(ctx)) if err := sheet.loadSampleRows(ctx, sampleSize); err != nil { return nil, err } if len(sheet.sampleRows) == 0 { return nil, errz.NoDataf("excel: sheet {%s} has no row data", sheet.name) } if sheet.sampleRowsMaxWidth == 0 { return nil, errz.NoDataf("excel: sheet {%s} has no column data", sheet.name) } var hasHeader bool if srcIngestHeader != nil { hasHeader = *srcIngestHeader } else { var err error if hasHeader, err = detectHeaderRow(ctx, sheet); err != nil { return nil, err } log.Debug("Detect header row for sheet", laSheet, sheet.name, lga.Val, hasHeader) } maxCols := sheet.sampleRowsMaxWidth if maxCols == 0 { log.Warn("sheet is empty: skipping", laSheet, sheet.name) return nil, nil //nolint:nilnil } colNames := make([]string, maxCols) colKinds := make([]kind.Kind, maxCols) colIngestMungeFns := make([]kind.MungeFunc, maxCols) firstDataRow := 0 // sheet is non-empty // Set up the column names if hasHeader { firstDataRow = 1 copy(colNames, sheet.sampleRows[0]) } else { for i := 0; i < maxCols; i++ { colNames[i] = stringz.GenerateAlphaColName(i, false) } } // Set up the column types if firstDataRow >= len(sheet.sampleRows) { // the sheet contains only one row (the header row). Let's // explicitly set the column type nonetheless for i := 0; i < maxCols; i++ { colKinds[i] = kind.Text } } else { // we have at least one data row, let's get the column types var err error colKinds, colIngestMungeFns, err = detectSheetColumnKinds(sheet, firstDataRow) if err != nil { return nil, err } } colNames, colKinds = syncColNamesKinds(colNames, colKinds) var err error if colNames, err = driver.MungeIngestColNames(ctx, colNames); err != nil { return nil, err } tblDef := &sqlmodel.TableDef{Name: sheet.name} cols := make([]*sqlmodel.ColDef, len(colNames)) for i, colName := range colNames { cols[i] = &sqlmodel.ColDef{Table: tblDef, Name: colName, Kind: colKinds[i]} } tblDef.Cols = cols lg.FromContext(ctx).Debug("Built table def", laSheet, sheet.name, "cols", strings.Join(colNames, ", ")) return &sheetTable{ sheet: sheet, def: tblDef, hasHeaderRow: hasHeader, colIngestMungeFns: colIngestMungeFns, }, nil } // syncColNamesKinds ensures that column names and kinds are in // a working state vis-à-vis each other. Notably if a colName is // empty and its equivalent kind is kind.Null, that element // is filtered out. func syncColNamesKinds(colNames []string, colKinds []kind.Kind) (names []string, kinds []kind.Kind) { // Allow for the case of "phantom" columns. That is, // columns with entirely empty data. // Note: not sure if this scenario is now reachable if len(colKinds) < len(colNames) { colNames = colNames[0:len(colKinds)] } for i := range colNames { // Filter out the case where the column name is empty // and the kind is kind.Null or kind.Unknown. if colNames[i] == "" && (colKinds[i] == kind.Null || colKinds[i] == kind.Unknown) { continue } names = append(names, colNames[i]) kinds = append(kinds, colKinds[i]) } colNames = names colKinds = kinds // Check that we don't have any unnamed columns (empty header) for i := 0; i < len(colNames); i++ { if colNames[i] == "" { // Empty col name... possibly we should just throw // an error, but instead we'll try to generate a col name. colName := stringz.GenerateAlphaColName(i, false) for stringz.InSlice(colNames[0:i], colName) { // If colName already exists, just append an // underscore and try again. colName += "_" } colNames[i] = colName } } for i := range colKinds { if colKinds[i] == kind.Null || colKinds[i] == kind.Unknown { colKinds[i] = kind.Text } } return colNames, colKinds } // rowToRecord accepts a row (in arg cells), and converts it into an appropriate // format for insertion to the DB. func rowToRecord(ctx context.Context, destColKinds []kind.Kind, ingestMungeFns []kind.MungeFunc, sheetName string, rowi int, cells []string, ) []any { log := lg.FromContext(ctx) vals := make([]any, len(destColKinds)) for coli, str := range cells { if coli >= len(vals) { log.Warn( "Skipping additional cells because there's more cells than expected", laSheet, sheetName, lga.Col, fmt.Sprintf("%d:%d", rowi, coli), lga.Count, len(vals), lga.Expected, len(destColKinds), ) continue } if str == "" { vals[coli] = nil continue } if fn := ingestMungeFns[coli]; fn != nil { v, err := fn(str) if err != nil { // This shouldn't happen, but if it does, fall back // to the string value. vals[coli] = str log.Warn("Cell munge func failed", laSheet, sheetName, "cell", fmt.Sprintf("%d:%d", rowi, coli), lga.Val, vals[coli], ) } else { vals[coli] = v } continue } // No munge func, just set the string. vals[coli] = str } return vals }