2020-08-06 20:58:47 +03:00
|
|
|
// Package xlsx implements the sq driver for Microsoft Excel.
|
|
|
|
package xlsx
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2020-08-12 21:24:01 +03:00
|
|
|
"database/sql"
|
2022-12-18 05:43:53 +03:00
|
|
|
"io"
|
|
|
|
|
2023-04-02 22:49:45 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/core/lg/lga"
|
|
|
|
|
|
|
|
"github.com/neilotoole/sq/libsq/core/lg/lgm"
|
|
|
|
|
|
|
|
"github.com/neilotoole/sq/libsq/core/lg"
|
|
|
|
|
|
|
|
"golang.org/x/exp/slog"
|
|
|
|
|
2020-08-06 20:58:47 +03:00
|
|
|
"github.com/tealeg/xlsx/v2"
|
|
|
|
|
2020-08-23 13:42:15 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/core/cleanup"
|
|
|
|
"github.com/neilotoole/sq/libsq/core/errz"
|
2020-08-06 20:58:47 +03:00
|
|
|
"github.com/neilotoole/sq/libsq/driver"
|
|
|
|
"github.com/neilotoole/sq/libsq/source"
|
|
|
|
)
|
|
|
|
|
|
|
|
const (
|
|
|
|
// Type is the sq source driver type for XLSX.
|
2023-04-22 06:36:32 +03:00
|
|
|
Type = source.DriverType("xlsx")
|
2020-08-06 20:58:47 +03:00
|
|
|
)
|
|
|
|
|
|
|
|
// Provider implements driver.Provider.
|
|
|
|
type Provider struct {
|
2023-04-02 22:49:45 +03:00
|
|
|
Log *slog.Logger
|
2020-08-06 20:58:47 +03:00
|
|
|
Files *source.Files
|
|
|
|
Scratcher driver.ScratchDatabaseOpener
|
|
|
|
}
|
|
|
|
|
|
|
|
// DriverFor implements driver.Provider.
|
2023-04-22 06:36:32 +03:00
|
|
|
func (p *Provider) DriverFor(typ source.DriverType) (driver.Driver, error) {
|
2020-08-06 20:58:47 +03:00
|
|
|
if typ != Type {
|
2023-04-02 22:49:45 +03:00
|
|
|
return nil, errz.Errorf("unsupported driver type {%s}", typ)
|
2020-08-06 20:58:47 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return &Driver{log: p.Log, scratcher: p.Scratcher, files: p.Files}, nil
|
|
|
|
}
|
|
|
|
|
2023-04-22 06:36:32 +03:00
|
|
|
var _ source.DriverDetectFunc = DetectXLSX
|
2020-08-23 13:42:15 +03:00
|
|
|
|
2023-04-22 06:36:32 +03:00
|
|
|
// DetectXLSX implements source.DriverDetectFunc, returning
|
2020-08-23 13:42:15 +03:00
|
|
|
// TypeXLSX and a score of 1.0 valid XLSX.
|
2023-04-22 06:36:32 +03:00
|
|
|
func DetectXLSX(ctx context.Context, openFn source.FileOpenFunc) (detected source.DriverType, score float32,
|
2022-12-18 11:35:59 +03:00
|
|
|
err error,
|
|
|
|
) {
|
2023-05-03 15:36:10 +03:00
|
|
|
log := lg.FromContext(ctx)
|
2020-08-23 13:42:15 +03:00
|
|
|
var r io.ReadCloser
|
|
|
|
r, err = openFn()
|
|
|
|
if err != nil {
|
|
|
|
return source.TypeNone, 0, errz.Err(err)
|
|
|
|
}
|
2023-04-02 22:49:45 +03:00
|
|
|
defer lg.WarnIfCloseError(log, lgm.CloseFileReader, r)
|
2020-08-23 13:42:15 +03:00
|
|
|
|
2022-12-18 02:11:33 +03:00
|
|
|
data, err := io.ReadAll(r)
|
2020-08-06 20:58:47 +03:00
|
|
|
if err != nil {
|
|
|
|
return source.TypeNone, 0, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// We don't need to read all rows, one will do.
|
|
|
|
const rowLimit = 1
|
|
|
|
_, err = xlsx.OpenBinaryWithRowLimit(data, rowLimit)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return source.TypeNone, 0, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return Type, 1.0, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Driver implements driver.Driver.
|
|
|
|
type Driver struct {
|
2023-04-02 22:49:45 +03:00
|
|
|
log *slog.Logger
|
2020-08-06 20:58:47 +03:00
|
|
|
scratcher driver.ScratchDatabaseOpener
|
|
|
|
files *source.Files
|
|
|
|
}
|
|
|
|
|
|
|
|
// DriverMetadata implements driver.Driver.
|
|
|
|
func (d *Driver) DriverMetadata() driver.Metadata {
|
|
|
|
return driver.Metadata{
|
|
|
|
Type: Type,
|
|
|
|
Description: "Microsoft Excel XLSX",
|
2022-12-18 11:35:59 +03:00
|
|
|
Doc: "https://en.wikipedia.org/wiki/Microsoft_Excel",
|
|
|
|
}
|
2020-08-06 20:58:47 +03:00
|
|
|
}
|
|
|
|
|
2023-04-08 21:09:27 +03:00
|
|
|
// Open implements driver.DatabaseOpener.
|
2020-08-06 20:58:47 +03:00
|
|
|
func (d *Driver) Open(ctx context.Context, src *source.Source) (driver.Database, error) {
|
2023-05-03 15:36:10 +03:00
|
|
|
lg.FromContext(ctx).Debug(lgm.OpenSrc, lga.Src, src)
|
2023-04-30 17:18:56 +03:00
|
|
|
|
2020-08-23 13:42:15 +03:00
|
|
|
r, err := d.files.Open(src)
|
2020-08-06 20:58:47 +03:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2023-04-02 22:49:45 +03:00
|
|
|
defer lg.WarnIfCloseError(d.log, lgm.CloseFileReader, r)
|
2020-08-06 20:58:47 +03:00
|
|
|
|
2022-12-18 02:11:33 +03:00
|
|
|
b, err := io.ReadAll(r)
|
2020-08-06 20:58:47 +03:00
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
xlFile, err := xlsx.OpenBinary(b)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
scratchDB, err := d.scratcher.OpenScratch(ctx, src.Handle)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
clnup := cleanup.New()
|
|
|
|
clnup.AddE(scratchDB.Close)
|
|
|
|
|
2023-04-02 22:49:45 +03:00
|
|
|
err = xlsxToScratch(ctx, src, xlFile, scratchDB)
|
2020-08-06 20:58:47 +03:00
|
|
|
if err != nil {
|
2023-04-02 22:49:45 +03:00
|
|
|
lg.WarnIfError(d.log, lgm.CloseDB, clnup.Run())
|
2020-08-06 20:58:47 +03:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return &database{log: d.log, src: src, impl: scratchDB, files: d.files, clnup: clnup}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Truncate implements driver.Driver.
|
2023-04-01 11:38:32 +03:00
|
|
|
func (d *Driver) Truncate(_ context.Context, src *source.Source, _ string, _ bool) (affected int64, err error) {
|
2020-08-06 20:58:47 +03:00
|
|
|
// TODO: WE could actually implement Truncate for xlsx.
|
|
|
|
// It would just mean deleting the rows from a sheet, and then
|
|
|
|
// saving the sheet.
|
2023-04-22 06:36:32 +03:00
|
|
|
return 0, errz.Errorf("driver type {%s} (%s) doesn't support dropping tables", Type, src.Handle)
|
2020-08-06 20:58:47 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// ValidateSource implements driver.Driver.
|
|
|
|
func (d *Driver) ValidateSource(src *source.Source) (*source.Source, error) {
|
2023-04-02 22:49:45 +03:00
|
|
|
d.log.Debug("Validating source: {%s}", src.RedactedLocation())
|
2020-08-06 20:58:47 +03:00
|
|
|
if src.Type != Type {
|
2023-04-22 06:36:32 +03:00
|
|
|
return nil, errz.Errorf("expected driver type {%s} but got {%s}", Type, src.Type)
|
2020-08-06 20:58:47 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return src, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ping implements driver.Driver.
|
2023-04-01 11:38:32 +03:00
|
|
|
func (d *Driver) Ping(_ context.Context, src *source.Source) (err error) {
|
2020-08-23 13:42:15 +03:00
|
|
|
r, err := d.files.Open(src)
|
2020-08-06 20:58:47 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-04-02 22:49:45 +03:00
|
|
|
defer lg.WarnIfCloseError(d.log, lgm.CloseFileReader, r)
|
2020-08-06 20:58:47 +03:00
|
|
|
|
2022-12-18 02:11:33 +03:00
|
|
|
b, err := io.ReadAll(r)
|
2020-08-06 20:58:47 +03:00
|
|
|
if err != nil {
|
|
|
|
return errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
_, err = xlsx.OpenBinaryWithRowLimit(b, 1)
|
|
|
|
if err != nil {
|
|
|
|
return errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
2020-08-12 21:24:01 +03:00
|
|
|
|
|
|
|
// database implements driver.Database.
|
|
|
|
type database struct {
|
2023-04-02 22:49:45 +03:00
|
|
|
log *slog.Logger
|
2020-08-12 21:24:01 +03:00
|
|
|
src *source.Source
|
|
|
|
files *source.Files
|
|
|
|
impl driver.Database
|
|
|
|
clnup *cleanup.Cleanup
|
|
|
|
}
|
|
|
|
|
|
|
|
// DB implements driver.Database.
|
|
|
|
func (d *database) DB() *sql.DB {
|
|
|
|
return d.impl.DB()
|
|
|
|
}
|
|
|
|
|
|
|
|
// SQLDriver implements driver.Database.
|
|
|
|
func (d *database) SQLDriver() driver.SQLDriver {
|
|
|
|
return d.impl.SQLDriver()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Source implements driver.Database.
|
|
|
|
func (d *database) Source() *source.Source {
|
|
|
|
return d.src
|
|
|
|
}
|
|
|
|
|
|
|
|
// SourceMetadata implements driver.Database.
|
2022-12-18 09:07:38 +03:00
|
|
|
//
|
2020-11-02 20:40:29 +03:00
|
|
|
// TODO: the implementation of SourceMetadata is out
|
|
|
|
// of sync with the way we import data. For example, empty
|
|
|
|
// rows are filtered out during import, and empty columns
|
|
|
|
// are discarded. Thus SourceMetadata needs an overhaul to
|
|
|
|
// bring its reporting into line with import.
|
2023-06-22 08:48:58 +03:00
|
|
|
func (d *database) SourceMetadata(_ context.Context, noSchema bool) (*source.Metadata, error) {
|
2020-08-12 21:24:01 +03:00
|
|
|
meta := &source.Metadata{Handle: d.src.Handle}
|
|
|
|
|
|
|
|
var err error
|
|
|
|
meta.Size, err = d.files.Size(d.src)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
meta.Name, err = source.LocationFileName(d.src)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
meta.FQName = meta.Name
|
|
|
|
meta.Location = d.src.Location
|
2023-05-19 17:24:18 +03:00
|
|
|
meta.Driver = Type
|
2020-08-12 21:24:01 +03:00
|
|
|
|
|
|
|
b, err := d.files.ReadAll(d.src)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
xlFile, err := xlsx.OpenBinary(b)
|
|
|
|
if err != nil {
|
2023-04-02 22:49:45 +03:00
|
|
|
return nil, errz.Wrapf(err, "unable to open XLSX file: %s", d.src.Location)
|
2020-08-12 21:24:01 +03:00
|
|
|
}
|
|
|
|
|
2023-06-22 08:48:58 +03:00
|
|
|
if noSchema {
|
|
|
|
return meta, nil
|
|
|
|
}
|
2020-08-12 21:24:01 +03:00
|
|
|
|
2023-07-04 20:31:47 +03:00
|
|
|
hasHeader := driver.OptIngestHeader.Get(d.src.Options)
|
2020-08-12 21:24:01 +03:00
|
|
|
for _, sheet := range xlFile.Sheets {
|
2020-08-16 00:06:40 +03:00
|
|
|
tbl := &source.TableMetadata{Name: sheet.Name, RowCount: int64(len(sheet.Rows))}
|
2020-08-12 21:24:01 +03:00
|
|
|
|
|
|
|
if hasHeader && tbl.RowCount > 0 {
|
|
|
|
tbl.RowCount--
|
|
|
|
}
|
|
|
|
|
|
|
|
colNames := getColNames(sheet, hasHeader)
|
2020-11-02 20:40:29 +03:00
|
|
|
|
|
|
|
// TODO: Should move over to using kind.Detector
|
|
|
|
colTypes := getCellColumnTypes(sheet, hasHeader)
|
2020-08-12 21:24:01 +03:00
|
|
|
|
|
|
|
for i, colType := range colTypes {
|
|
|
|
col := &source.ColMetadata{}
|
|
|
|
col.BaseType = cellTypeToString(colType)
|
|
|
|
col.ColumnType = col.BaseType
|
|
|
|
col.Position = int64(i)
|
|
|
|
col.Name = colNames[i]
|
|
|
|
tbl.Columns = append(tbl.Columns, col)
|
|
|
|
}
|
|
|
|
|
|
|
|
meta.Tables = append(meta.Tables, tbl)
|
|
|
|
}
|
|
|
|
|
2023-06-22 08:48:58 +03:00
|
|
|
meta.TableCount = int64(len(meta.Tables))
|
|
|
|
|
2020-08-12 21:24:01 +03:00
|
|
|
return meta, nil
|
|
|
|
}
|
|
|
|
|
2022-12-18 09:07:38 +03:00
|
|
|
// TableMetadata implements driver.Database.
|
2023-04-01 11:38:32 +03:00
|
|
|
func (d *database) TableMetadata(_ context.Context, tblName string) (*source.TableMetadata, error) {
|
2022-12-18 09:07:38 +03:00
|
|
|
b, err := d.files.ReadAll(d.src)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errz.Err(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
xlFile, err := xlsx.OpenBinary(b)
|
|
|
|
if err != nil {
|
2023-04-02 22:49:45 +03:00
|
|
|
return nil, errz.Wrapf(err, "unable to open XLSX file: %s", d.src.Location)
|
2022-12-18 09:07:38 +03:00
|
|
|
}
|
|
|
|
|
2023-07-04 20:31:47 +03:00
|
|
|
hasHeader := driver.OptIngestHeader.Get(d.src.Options)
|
2022-12-18 09:07:38 +03:00
|
|
|
|
|
|
|
for _, sheet := range xlFile.Sheets {
|
|
|
|
if sheet.Name != tblName {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
tbl := &source.TableMetadata{Name: sheet.Name, RowCount: int64(len(sheet.Rows))}
|
|
|
|
|
|
|
|
if hasHeader && tbl.RowCount > 0 {
|
|
|
|
tbl.RowCount--
|
|
|
|
}
|
|
|
|
|
|
|
|
colNames := getColNames(sheet, hasHeader)
|
|
|
|
|
|
|
|
// TODO: Should move over to using kind.Detector
|
|
|
|
colTypes := getCellColumnTypes(sheet, hasHeader)
|
|
|
|
|
|
|
|
for i, colType := range colTypes {
|
|
|
|
col := &source.ColMetadata{}
|
|
|
|
col.BaseType = cellTypeToString(colType)
|
|
|
|
col.ColumnType = col.BaseType
|
|
|
|
col.Position = int64(i)
|
|
|
|
col.Name = colNames[i]
|
|
|
|
tbl.Columns = append(tbl.Columns, col)
|
|
|
|
}
|
|
|
|
|
|
|
|
return tbl, nil
|
|
|
|
}
|
|
|
|
|
2023-04-02 22:49:45 +03:00
|
|
|
return nil, errz.Errorf("table {%s} not found", tblName)
|
2022-12-18 09:07:38 +03:00
|
|
|
}
|
|
|
|
|
2020-08-12 21:24:01 +03:00
|
|
|
// Close implements driver.Database.
|
|
|
|
func (d *database) Close() error {
|
2023-05-03 15:36:10 +03:00
|
|
|
d.log.Debug(lgm.CloseDB, lga.Handle, d.src.Handle)
|
2020-08-12 21:24:01 +03:00
|
|
|
|
|
|
|
// No need to explicitly invoke c.impl.Close because
|
|
|
|
// that's already added to c.clnup
|
|
|
|
return d.clnup.Run()
|
|
|
|
}
|