// sq/drivers/xlsx/xlsx_test.go

package xlsx_test

import (
	"context"
	"path/filepath"
	"testing"

	"github.com/neilotoole/sq/cli/testrun"

	"github.com/neilotoole/sq/libsq/driver"

	"github.com/neilotoole/sq/libsq/core/options"

	"github.com/neilotoole/sq/testh/tutil"

	"github.com/stretchr/testify/require"

	"github.com/neilotoole/sq/drivers/xlsx"
	"github.com/neilotoole/sq/libsq/source"
	"github.com/neilotoole/sq/testh"
	"github.com/neilotoole/sq/testh/proj"
	"github.com/neilotoole/sq/testh/sakila"
)

// Test_Smoke_Subset is a smoke test: it selects everything from the
// actor table of the sakila.XLSXSubset source, and checks that the
// result has the expected number of columns and rows.
func Test_Smoke_Subset(t *testing.T) {
	const query = "SELECT * FROM actor"

	h := testh.New(t, testh.OptLongOpen())
	got, err := h.QuerySQL(h.Source(sakila.XLSXSubset), query)
	require.NoError(t, err)

	wantColCount := len(sakila.TblActorCols())
	require.Equal(t, wantColCount, len(got.RecMeta))
	require.Equal(t, sakila.TblActorCount, len(got.Recs))
}

// Test_Smoke_Full is the counterpart of the subset smoke test, run
// against the sakila.XLSX source: it selects everything from the actor
// table and checks the column and row counts. It is gated by
// tutil.SkipShort (presumably skipped in short mode; see pkg tutil).
func Test_Smoke_Full(t *testing.T) {
	tutil.SkipShort(t, true)

	const query = "SELECT * FROM actor"

	// NOTE: testh.OptLongOpen is required here: without it, this test
	// fails on Windows (in the GH workflow). The open shouldn't be that
	// slow, even on Windows, so it's probably worth a closer look.
	// However, the xlsx driver is going to be rewritten eventually, so
	// the investigation can wait until then.
	// See: https://github.com/neilotoole/sq/issues/200
	h := testh.New(t, testh.OptLongOpen())
	got, err := h.QuerySQL(h.Source(sakila.XLSX), query)
	require.NoError(t, err)

	wantColCount := len(sakila.TblActorCols())
	require.Equal(t, wantColCount, len(got.RecMeta))
	require.Equal(t, sakila.TblActorCount, len(got.Recs))
}

// Test_XLSX_BadDateRecognition queries the Summary sheet of the
// problem_with_recognizing_date_colA.xlsx test file, with the
// ingest-header option explicitly set to true on the source, and
// verifies the number of records returned.
func Test_XLSX_BadDateRecognition(t *testing.T) {
	t.Parallel()

	const (
		query        = "SELECT * FROM Summary"
		wantRecCount = 21
	)

	h := testh.New(t)

	xlsxSrc := &source.Source{
		Handle:   "@xlsx_bad_date",
		Type:     xlsx.Type,
		Location: proj.Abs("drivers/xlsx/testdata/problem_with_recognizing_date_colA.xlsx"),
		Options:  options.Options{driver.OptIngestHeader.Key(): true},
	}

	// Sanity check: the option must be both present on the source
	// and resolve to true before we run the query.
	require.True(t, xlsxSrc.Options.IsSet(driver.OptIngestHeader))
	require.True(t, driver.OptIngestHeader.Get(xlsxSrc.Options))

	got, err := h.QuerySQL(xlsxSrc, query)
	require.NoError(t, err)
	require.Equal(t, wantRecCount, len(got.Recs))
}

// TestHandleSomeEmptySheets checks that an XLSX workbook in which
// some of the sheets are empty can still be imported by sq: Sheet1
// of the test workbook is queried, and its record count is verified.
func TestHandleSomeEmptySheets(t *testing.T) {
	t.Parallel()

	const (
		query        = "SELECT * FROM Sheet1"
		wantRecCount = 2
	)

	h := testh.New(t)

	xlsxSrc := &source.Source{
		Handle:   "@xlsx_empty_sheets",
		Type:     xlsx.Type,
		Location: proj.Abs("drivers/xlsx/testdata/test_with_some_empty_sheets.xlsx"),
	}

	got, err := h.QuerySQL(xlsxSrc, query)
	require.NoError(t, err)
	require.Equal(t, wantRecCount, len(got.Recs))
}

// TestIngestDuplicateColumns adds testdata/actor_duplicate_cols.xlsx as
// source @actor_dup (with --ingest.header=true), and verifies via CSV
// output of .actor that the second occurrence of the actor_id column is
// emitted as actor_id_1. It then sets the column rename template
// (driver.OptIngestColRename) via "config set", and verifies that the
// emitted headers follow the new template.
func TestIngestDuplicateColumns(t *testing.T) {
	actorDataRow0 := []string{"1", "PENELOPE", "GUINESS", "2020-02-15T06:59:28Z", "1"}

	ctx := context.Background()
	tr := testrun.New(ctx, t, nil).Hush()

	err := tr.Exec("add",
		"--handle", "@actor_dup",
		"--ingest.header=true",
		filepath.Join("testdata", "actor_duplicate_cols.xlsx"),
	)
	require.NoError(t, err)

	// Each command gets its own run, created from the previous one.
	// NOTE(review): presumably this carries the config (incl. the added
	// source) forward; confirm against testrun.New.
	tr = testrun.New(ctx, t, tr)
	require.NoError(t, tr.Exec("--csv", ".actor"))
	wantHeaders := []string{"actor_id", "first_name", "last_name", "last_update", "actor_id_1"}
	data := tr.BindCSV()

	// Assert the length before indexing into data, so that short or
	// empty output fails this test cleanly instead of panicking with an
	// index-out-of-range (which would abort the whole test binary).
	require.Len(t, data, sakila.TblActorCount+1) // +1 for header row
	require.Equal(t, wantHeaders, data[0])

	// Make sure the data is correct
	require.Equal(t, actorDataRow0, data[1])

	// Verify that changing the template works
	const tpl2 = "x_{{.Name}}{{with .Recurrence}}_{{.}}{{end}}"

	tr = testrun.New(ctx, t, tr)
	require.NoError(t, tr.Exec(
		"config",
		"set",
		driver.OptIngestColRename.Key(),
		tpl2,
	))
	tr = testrun.New(ctx, t, tr)
	require.NoError(t, tr.Exec("--csv", ".actor"))
	wantHeaders = []string{"x_actor_id", "x_first_name", "x_last_name", "x_last_update", "x_actor_id_1"}
	data = tr.BindCSV()

	// Guard the header-row access, as above.
	require.NotEmpty(t, data)
	require.Equal(t, wantHeaders, data[0])
}

// TestDetectHeaderRow adds several XLSX files from testdata without
// specifying whether they have a header row, then checks the CSV
// output (with --header) of .actor: the total number of CSV records,
// and the contents of the leading records.
func TestDetectHeaderRow(t *testing.T) {
	actorRows := [][]string{
		{"1", "PENELOPE", "GUINESS", "2020-02-15T06:59:28Z"},
		{"2", "NICK", "WAHLBERG", "2020-02-15T06:59:28Z"},
		{"3", "ED", "CHASE", "2020-02-15T06:59:28Z"},
	}
	// abcd is the header expected when the sheet's own column names
	// are not used as the header.
	abcd := []string{"A", "B", "C", "D"}

	testCases := []struct {
		// filename is the name of the XLSX file in testdata.
		filename string
		// wantRecordCount is the expected number of CSV records,
		// including the CSV header record.
		wantRecordCount int
		// matchRecords holds the expected leading CSV records,
		// starting with the CSV header record.
		matchRecords [][]string
	}{
		{
			filename:        "actor_header.xlsx",
			wantRecordCount: sakila.TblActorCount + 1,
			matchRecords:    [][]string{sakila.TblActorCols(), actorRows[0], actorRows[1], actorRows[2]},
		},
		{
			filename:        "actor_no_header.xlsx",
			wantRecordCount: sakila.TblActorCount + 1,
			matchRecords:    [][]string{abcd, actorRows[0], actorRows[1], actorRows[2]},
		},
		{
			// Both header rows are expected to come through as data.
			filename:        "actor_double_header.xlsx",
			wantRecordCount: sakila.TblActorCount + 3,
			matchRecords:    [][]string{abcd, sakila.TblActorCols(), sakila.TblActorCols(), actorRows[0]},
		},
	}

	for _, tc := range testCases {
		tc := tc
		t.Run(tc.filename, func(t *testing.T) {
			ctx := context.Background()
			fp := filepath.Join("testdata", tc.filename)

			tr := testrun.New(ctx, t, nil).Hush()
			err := tr.Exec("add", fp)
			require.NoError(t, err)

			tr = testrun.New(ctx, t, tr)
			require.NoError(t, tr.Exec("--csv", "--header", ".actor"))

			data := tr.BindCSV()

			// Log the records to aid debugging on failure.
			for _, rec := range data {
				t.Log(rec)
			}

			require.Equal(t, tc.wantRecordCount, len(data))

			// Guard the data[i] accesses below. GreaterOrEqual reports
			// both operands on failure, unlike True on a comparison.
			require.GreaterOrEqual(t, len(data), len(tc.matchRecords))
			for i, wantRec := range tc.matchRecords {
				gotRec := data[i]
				require.Equal(t, wantRec, gotRec, "record %d", i)
			}
		})
	}
}
codebase refactor 2020-08-06 20:58:47 +03:00			`package xlsx_test`

			`import (`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`"context"`
			`"path/filepath"`
codebase refactor 2020-08-06 20:58:47 +03:00			`"testing"`

#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`"github.com/neilotoole/sq/cli/testrun"`

			`"github.com/neilotoole/sq/libsq/driver"`

Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00			`"github.com/neilotoole/sq/libsq/core/options"`

Linting of _test.go files (#121) * test linting * test linting * test linting * test linting 2022-12-18 10:18:35 +03:00			`"github.com/neilotoole/sq/testh/tutil"`

codebase refactor 2020-08-06 20:58:47 +03:00			`"github.com/stretchr/testify/require"`

			`"github.com/neilotoole/sq/drivers/xlsx"`
			`"github.com/neilotoole/sq/libsq/source"`
			`"github.com/neilotoole/sq/testh"`
			`"github.com/neilotoole/sq/testh/proj"`
			`"github.com/neilotoole/sq/testh/sakila"`
			`)`

			`func Test_Smoke_Subset(t *testing.T) {`
Make testh.OptLongOpen even longer 2023-05-26 08:02:11 +03:00			`th := testh.New(t, testh.OptLongOpen())`
codebase refactor 2020-08-06 20:58:47 +03:00			`src := th.Source(sakila.XLSXSubset)`

			`sink, err := th.QuerySQL(src, "SELECT * FROM actor")`
			`require.NoError(t, err)`
Switched pkg sakila "const" slices to functions (#57) * replaced sakila source slices with funcs * missed some pkg sakila slices to refactor 2020-08-10 18:16:44 +03:00			`require.Equal(t, len(sakila.TblActorCols()), len(sink.RecMeta))`
codebase refactor 2020-08-06 20:58:47 +03:00			`require.Equal(t, sakila.TblActorCount, len(sink.Recs))`
			`}`

			`func Test_Smoke_Full(t *testing.T) {`
fixed broken mysql tests (parseTime param); moved some test funcs to pkg tutil (#109) 2022-12-17 05:09:49 +03:00			`tutil.SkipShort(t, true)`
codebase refactor 2020-08-06 20:58:47 +03:00
Renamed testh.OptLongDB to testh.OptLongOpen; added it to other tests. 2023-04-19 17:08:26 +03:00			`// This test fails (in GH workflow) on Windows without testh.OptLongOpen.`
#199 - Config, refactoring (#204) * refactor: moved cli flags to pkg cli/flag * testh: add OptLongDB for long-running tests * implement 'sq config dir' * legacy dir migration: probably a bad idea * cleanup * Refactored SQ_CONFIG and --config * added yaml writer * Dialing in tests * YAML output for 'sq driver ls' * Significant refactoring of config * Minor test for ioz * Rename source.Set to source.Collection * Cleaning up references to source.Set 2023-04-19 08:28:09 +03:00			`// That's probably worth looking into further. It shouldn't be that slow,`
			`// even on Windows. However, we are going to rewrite the xlsx driver eventually,`
			`// so it can wait until then.`
			`// See: https://github.com/neilotoole/sq/issues/200`
Renamed testh.OptLongDB to testh.OptLongOpen; added it to other tests. 2023-04-19 17:08:26 +03:00			`th := testh.New(t, testh.OptLongOpen())`
codebase refactor 2020-08-06 20:58:47 +03:00			`src := th.Source(sakila.XLSX)`

			`sink, err := th.QuerySQL(src, "SELECT * FROM actor")`
			`require.NoError(t, err)`
Switched pkg sakila "const" slices to functions (#57) * replaced sakila source slices with funcs * missed some pkg sakila slices to refactor 2020-08-10 18:16:44 +03:00			`require.Equal(t, len(sakila.TblActorCols()), len(sink.RecMeta))`
codebase refactor 2020-08-06 20:58:47 +03:00			`require.Equal(t, sakila.TblActorCount, len(sink.Recs))`
			`}`

			`func Test_XLSX_BadDateRecognition(t *testing.T) {`
			`t.Parallel()`

			`th := testh.New(t)`

			`src := &source.Source{`
			`Handle: "@xlsx_bad_date",`
			`Type: xlsx.Type,`
			`Location: proj.Abs("drivers/xlsx/testdata/problem_with_recognizing_date_colA.xlsx"),`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`Options: options.Options{driver.OptIngestHeader.Key(): true},`
codebase refactor 2020-08-06 20:58:47 +03:00			`}`

#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`require.True(t, src.Options.IsSet(driver.OptIngestHeader))`
Refactor config options (#209) * Refactor config and options. 2023-04-26 18:16:42 +03:00
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`hasHeader := driver.OptIngestHeader.Get(src.Options)`
codebase refactor 2020-08-06 20:58:47 +03:00			`require.True(t, hasHeader)`

			`sink, err := th.QuerySQL(src, "SELECT * FROM Summary")`
			`require.NoError(t, err)`
			`require.Equal(t, 21, len(sink.Recs))`
			`}`
Xlsx import handle empty sheets (#79) * can now import XLSX with empty sheets * renamed XLSX test article 2021-02-20 09:22:35 +03:00
			`// TestHandleSomeEmptySheets verifies that sq can import XLSX`
			`// when there are some empty sheets.`
			`func TestHandleSomeEmptySheets(t *testing.T) {`
			`t.Parallel()`

			`th := testh.New(t)`

			`src := &source.Source{`
			`Handle: "@xlsx_empty_sheets",`
			`Type: xlsx.Type,`
			`Location: proj.Abs("drivers/xlsx/testdata/test_with_some_empty_sheets.xlsx"),`
			`}`

			`sink, err := th.QuerySQL(src, "SELECT * FROM Sheet1")`
			`require.NoError(t, err)`
			`require.Equal(t, 2, len(sink.Recs))`
			`}`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00
			`func TestIngestDuplicateColumns(t *testing.T) {`
#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`actorDataRow0 := []string{"1", "PENELOPE", "GUINESS", "2020-02-15T06:59:28Z", "1"}`

#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`ctx := context.Background()`
#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`tr := testrun.New(ctx, t, nil).Hush()`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00
			`err := tr.Exec("add",`
			`"--handle", "@actor_dup",`
			`"--ingest.header=true",`
			`filepath.Join("testdata", "actor_duplicate_cols.xlsx"),`
			`)`
			`require.NoError(t, err)`

#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`tr = testrun.New(ctx, t, tr)`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`require.NoError(t, tr.Exec("--csv", ".actor"))`
			`wantHeaders := []string{"actor_id", "first_name", "last_name", "last_update", "actor_id_1"}`
#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`data := tr.BindCSV()`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`require.Equal(t, wantHeaders, data[0])`

			`// Make sure the data is correct`
			`require.Len(t, data, sakila.TblActorCount+1) // +1 for header row`
#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`require.Equal(t, actorDataRow0, data[1])`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00
			`// Verify that changing the template works`
			`const tpl2 = "x_{{.Name}}{{with .Recurrence}}_{{.}}{{end}}"`

			`tr = testrun.New(ctx, t, tr)`
			`require.NoError(t, tr.Exec(`
			`"config",`
			`"set",`
			`driver.OptIngestColRename.Key(),`
			`tpl2,`
			`))`
			`tr = testrun.New(ctx, t, tr)`
			`require.NoError(t, tr.Exec("--csv", ".actor"))`
			`wantHeaders = []string{"x_actor_id", "x_first_name", "x_last_name", "x_last_update", "x_actor_id_1"}`
#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00			`data = tr.BindCSV()`
#99: Rename duplicate ingest headers (#283) * CSV now renames duplicate ingest headers * Fix broken test * xlsx ingester now handles duplicate col names * Update CHANGELOG * Additional tests for ingest.column.rename * Removed dead comment in grammar 2023-07-04 20:31:47 +03:00			`require.Equal(t, wantHeaders, data[0])`
			`}`
#191: XLSX driver auto-detects header row (#284) * xlsx driver now detects header row. 2023-07-08 18:21:27 +03:00
			`func TestDetectHeaderRow(t *testing.T) {`
			`actorRows := [][]string{`
			`{"1", "PENELOPE", "GUINESS", "2020-02-15T06:59:28Z"},`
			`{"2", "NICK", "WAHLBERG", "2020-02-15T06:59:28Z"},`
			`{"3", "ED", "CHASE", "2020-02-15T06:59:28Z"},`
			`}`
			`abcd := []string{"A", "B", "C", "D"}`

			`testCases := []struct {`
			`filename string`
			`wantRecordCount int`
			`matchRecords [][]string`
			`}{`
			`{`
			`filename: "actor_header.xlsx",`
			`wantRecordCount: sakila.TblActorCount + 1,`
			`matchRecords: [][]string{sakila.TblActorCols(), actorRows[0], actorRows[1], actorRows[2]},`
			`},`
			`{`
			`filename: "actor_no_header.xlsx",`
			`wantRecordCount: sakila.TblActorCount + 1,`
			`matchRecords: [][]string{abcd, actorRows[0], actorRows[1], actorRows[2]},`
			`},`
			`{`
			`filename: "actor_double_header.xlsx",`
			`wantRecordCount: sakila.TblActorCount + 3,`
			`matchRecords: [][]string{abcd, sakila.TblActorCols(), sakila.TblActorCols(), actorRows[0]},`
			`},`
			`}`

			`for _, tc := range testCases {`
			`tc := tc`
			`t.Run(tc.filename, func(t *testing.T) {`
			`ctx := context.Background()`
			`fp := filepath.Join("testdata", tc.filename)`

			`tr := testrun.New(ctx, t, nil).Hush()`
			`err := tr.Exec("add", fp)`
			`require.NoError(t, err)`

			`tr = testrun.New(ctx, t, tr)`
			`require.NoError(t, tr.Exec("--csv", "--header", ".actor"))`

			`data := tr.BindCSV()`

			`for _, rec := range data {`
			`t.Log(rec)`
			`}`

			`require.Equal(t, tc.wantRecordCount, len(data))`

			`require.True(t, len(data) >= len(tc.matchRecords))`
			`for i, wantRec := range tc.matchRecords {`
			`gotRec := data[i]`
			`require.Equal(t, wantRec, gotRec, "record %d", i)`
			`}`
			`})`
			`}`
			`}`