Excel Reading (XLSX) using SAX Default Handler (#10877)

- Implement Excel reading as a SAXMLParser.
This commit is contained in:
James Dunkerley 2024-12-13 21:01:16 +00:00 committed by GitHub
parent e6bcd5e485
commit 63ed629210
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 1267 additions and 229 deletions

View File

@ -49,7 +49,7 @@ type Excel_Workbook
- file: The file to load. - file: The file to load.
- xls_format: Whether to use the old XLS format (default is XLSX). - xls_format: Whether to use the old XLS format (default is XLSX).
new : File | Temporary_File -> Boolean -> Excel_Workbook new : File | Temporary_File -> Boolean -> Excel_Workbook
new file:(File | Temporary_File) xls_format=False = new file:(File | Temporary_File) xls_format:Boolean=False =
file_for_errors = if file.is_a Temporary_File then Nothing else file file_for_errors = if file.is_a Temporary_File then Nothing else file
continuation raw_file = continuation raw_file =
@ -73,7 +73,7 @@ type Excel_Workbook
- xls_format: Whether to use the old XLS format (default is XLSX). - xls_format: Whether to use the old XLS format (default is XLSX).
- file: Optional file reference. - file: Optional file reference.
from_stream : Input_Stream -> Boolean -> File | Nothing -> Excel_Workbook from_stream : Input_Stream -> Boolean -> File | Nothing -> Excel_Workbook
from_stream stream xls_format=False file=Nothing = Excel_Reader.handle_bad_format file <| from_stream stream xls_format:Boolean=False file=Nothing = Excel_Reader.handle_bad_format file <|
temp_file = Temporary_File.from_stream_light stream temp_file = Temporary_File.from_stream_light stream
Excel_Workbook.new temp_file xls_format Excel_Workbook.new temp_file xls_format
@ -89,8 +89,8 @@ type Excel_Workbook
## PRIVATE ## PRIVATE
ICON metadata ICON metadata
Returns the list of databases (or catalogs) for the connection. Returns the list of databases (or catalogs) for the connection.
databases : Nothing databases : Vector (Text | Nothing)
databases self = Nothing databases self = [Nothing]
## PRIVATE ## PRIVATE
ICON metadata ICON metadata
@ -109,7 +109,7 @@ type Excel_Workbook
Arguments: Arguments:
- database: The target file to open as an Excel_Workbook. - database: The target file to open as an Excel_Workbook.
set_database : Text | File -> Excel_Workbook ! Illegal_Argument set_database : Text | File -> Excel_Workbook ! Illegal_Argument
set_database self database = set_database self database:(Text | File) =
if database == self.database then self else if database == self.database then self else
file = File.new database file = File.new database
if file.exists && file.is_directory.not then Excel_Workbook.new file self.xls_format else if file.exists && file.is_directory.not then Excel_Workbook.new file self.xls_format else
@ -163,7 +163,7 @@ type Excel_Workbook
Gets the names of all the named ranges. Gets the names of all the named ranges.
named_ranges : Vector Text named_ranges : Vector Text
named_ranges self = self.with_java_workbook java_workbook-> named_ranges self = self.with_java_workbook java_workbook->
Vector.from_polyglot_array (ExcelReader.readRangeNames java_workbook) Vector.from_polyglot_array java_workbook.getRangeNames
## PRIVATE ## PRIVATE
ICON metadata ICON metadata

View File

@ -20,6 +20,7 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.enso.table.excel.xssfreader.XSSFReaderWorkbook;
public class ExcelConnectionPool { public class ExcelConnectionPool {
public static final ExcelConnectionPool INSTANCE = new ExcelConnectionPool(); public static final ExcelConnectionPool INSTANCE = new ExcelConnectionPool();
@ -64,7 +65,7 @@ public class ExcelConnectionPool {
record.refCount = 1; record.refCount = 1;
record.file = file; record.file = file;
record.format = format; record.format = format;
record.workbook = openWorkbook(file, format, false); record.reopen(true);
records.put(key, record); records.put(key, record);
return new ReadOnlyExcelConnection(this, key, record); return new ReadOnlyExcelConnection(this, key, record);
} }
@ -212,10 +213,10 @@ public class ExcelConnectionPool {
private int refCount; private int refCount;
private File file; private File file;
private ExcelFileFormat format; private ExcelFileFormat format;
private Workbook workbook; private ExcelWorkbook workbook;
private IOException initializationException = null; private IOException initializationException = null;
<T> T withWorkbook(Function<Workbook, T> action) throws IOException { <T> T withWorkbook(Function<ExcelWorkbook, T> action) throws IOException {
synchronized (this) { synchronized (this) {
return action.apply(accessCurrentWorkbook()); return action.apply(accessCurrentWorkbook());
} }
@ -238,7 +239,10 @@ public class ExcelConnectionPool {
} }
try { try {
workbook = openWorkbook(file, format, false); workbook =
format == ExcelFileFormat.XLSX
? new XSSFReaderWorkbook(file.getAbsolutePath())
: ExcelWorkbook.forPOIUserModel(openWorkbook(file, format, false));
} catch (IOException e) { } catch (IOException e) {
initializationException = e; initializationException = e;
if (throwOnFailure) { if (throwOnFailure) {
@ -248,7 +252,7 @@ public class ExcelConnectionPool {
} }
} }
private Workbook accessCurrentWorkbook() throws IOException { private ExcelWorkbook accessCurrentWorkbook() throws IOException {
synchronized (this) { synchronized (this) {
if (workbook == null) { if (workbook == null) {
if (initializationException != null) { if (initializationException != null) {
@ -278,7 +282,7 @@ public class ExcelConnectionPool {
throw e; throw e;
} }
} }
case XLSX -> { case XLSX, XLSX_FALLBACK -> {
try { try {
PackageAccess access = writeAccess ? PackageAccess.READ_WRITE : PackageAccess.READ; PackageAccess access = writeAccess ? PackageAccess.READ_WRITE : PackageAccess.READ;
OPCPackage pkg = OPCPackage.open(file, access); OPCPackage pkg = OPCPackage.open(file, access);
@ -300,7 +304,7 @@ public class ExcelConnectionPool {
private static Workbook createEmptyWorkbook(ExcelFileFormat format) { private static Workbook createEmptyWorkbook(ExcelFileFormat format) {
return switch (format) { return switch (format) {
case XLS -> new HSSFWorkbook(); case XLS -> new HSSFWorkbook();
case XLSX -> new XSSFWorkbook(); case XLSX, XLSX_FALLBACK -> new XSSFWorkbook();
}; };
} }

View File

@ -2,5 +2,6 @@ package org.enso.table.excel;
public enum ExcelFileFormat { public enum ExcelFileFormat {
XLS, XLS,
XLSX XLSX,
XLSX_FALLBACK
} }

View File

@ -57,7 +57,7 @@ public class ExcelHeaders {
String[] output = new String[currentEndCol - startCol + 1]; String[] output = new String[currentEndCol - startCol + 1];
for (int col = startCol; col <= currentEndCol; col++) { for (int col = startCol; col <= currentEndCol; col++) {
String cellText = row.getFormattedCell(col); String cellText = row.getCellText(col);
String name = cellText.isEmpty() ? "" : deduplicator.makeUnique(cellText); String name = cellText.isEmpty() ? "" : deduplicator.makeUnique(cellText);
output[col - startCol] = name; output[col - startCol] = name;

View File

@ -197,7 +197,7 @@ public class ExcelRange {
Context context = Context.getCurrent(); Context context = Context.getCurrent();
while (currentRow != null && !currentRow.isEmpty(excelRange.getLeftColumn(), rightColumn)) { while (currentRow != null && !currentRow.isEmpty(excelRange.getLeftColumn(), rightColumn)) {
rightColumn = currentRow.findEndRight(rightColumn); rightColumn = findEndRight(currentRow, rightColumn);
bottomRow++; bottomRow++;
currentRow = sheet.get(bottomRow); currentRow = sheet.get(bottomRow);
@ -212,6 +212,16 @@ public class ExcelRange {
bottomRow - 1); bottomRow - 1);
} }
private static int findEndRight(ExcelRow row, int start) {
Context context = Context.getCurrent();
int column = start;
while (!row.isEmpty(column + 1)) {
column++;
context.safepoint();
}
return column;
}
/** /**
* @param index The index to the next character after the parsed value * @param index The index to the next character after the parsed value
* @param value Parsed integer value or 0 if not valid * @param value Parsed integer value or 0 if not valid

View File

@ -10,114 +10,51 @@ import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.ExcelNumberFormat; import org.apache.poi.ss.usermodel.ExcelNumberFormat;
import org.apache.poi.ss.usermodel.FormulaError; import org.apache.poi.ss.usermodel.FormulaError;
import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.graalvm.polyglot.Context; import org.graalvm.polyglot.Context;
/** Wrapper class to handle Excel rows. */ /** Wrapper class to handle Excel rows. */
public class ExcelRow { public interface ExcelRow {
private static final DataFormatter formatter = new DataFormatter(); /** Gets the initial column index within the row (1-based). */
int getFirstColumn();
private final Row row; /** Gets the final column index within the row (1-based). */
private final int firstColumn; int getLastColumn();
private final int lastColumn;
private final boolean use1904Format;
public ExcelRow(Row row, boolean use1904Format) { /** Gets the cell at the given index within the row (1-based). */
this.row = row; Object getCellValue(int column);
this.firstColumn = row.getFirstCellNum() + 1;
this.lastColumn = row.getLastCellNum(); /** Gets the text of a cell at the given index within the row (1-based). */
this.use1904Format = use1904Format; String getCellText(int column);
/** Gets the cell at the given index within the row (1-based). */
Cell get(int column);
/** Checks if the specified cell is empty. */
boolean isEmpty(int column);
/** Checks if the specified set of cells are empty. */
boolean isEmpty(int start, int end);
/** Gets the cells as text. */
String[] getCellsAsText(int startCol, int endCol);
/**
 * Creates an ExcelRow wrapping the row at the given index (1-based) of an Apache POI Sheet, or
 * returns null if the sheet has no such row.
 */
static ExcelRow forPOIUserModel(Sheet sheet, int rowIndex, boolean use1904Format) {
var row = sheet.getRow(rowIndex - 1);
return row == null
? null
: new ExcelRowFromPOIUserModel(
row, row.getFirstCellNum() + 1, row.getLastCellNum(), use1904Format);
} }
public int getFirstColumn() { static boolean isEmptyHelper(ExcelRow row, int start, int end) {
return firstColumn;
}
public int getLastColumn() {
return lastColumn;
}
public Cell get(int column) {
return (column < firstColumn || column > lastColumn) ? null : row.getCell(column - 1);
}
public Object getCellValue(int column) {
Cell cell = get(column);
CellType cellType = getCellType(cell);
switch (cellType) {
case NUMERIC:
double dblValue = cell.getNumericCellValue();
var nf = ExcelNumberFormat.from(cell, null);
if (nf != null && DateUtil.isADateFormat(nf.getIdx(), nf.getFormat())) {
var temporal =
use1904Format
? ExcelUtils.fromExcelDateTime1904(dblValue)
: ExcelUtils.fromExcelDateTime(dblValue);
if (temporal == null) {
return null;
}
return switch (temporal) {
case LocalDate date -> {
var dateFormat = cell.getCellStyle().getDataFormatString();
yield (dateFormat.contains("h") || dateFormat.contains("H"))
? date.atStartOfDay(ZoneId.systemDefault())
: date;
}
case ZonedDateTime zdt -> {
if (!use1904Format || zdt.getYear() != 1904 || zdt.getDayOfYear() != 1) {
yield temporal;
}
var dateFormat = cell.getCellStyle().getDataFormatString();
yield (dateFormat.contains("y")
|| dateFormat.contains("M")
|| dateFormat.contains("d"))
? zdt
: zdt.toLocalTime();
}
default -> temporal;
};
} else {
if (dblValue == (long) dblValue) {
return (long) dblValue;
} else {
return dblValue;
}
}
case STRING:
return cell.getStringCellValue();
case BOOLEAN:
return cell.getBooleanCellValue();
default:
return null;
}
}
public static CellType getCellType(Cell cell) {
if (cell == null) {
return CellType._NONE;
}
CellType cellType = cell.getCellType();
if (cellType == CellType.FORMULA) {
cellType = cell.getCachedFormulaResultType();
}
return cellType;
}
public boolean isEmpty(int column) {
CellType cellType = getCellType(get(column));
return (cellType == CellType._NONE) || (cellType == CellType.BLANK);
}
public boolean isEmpty(int start, int end) {
Context context = Context.getCurrent(); Context context = Context.getCurrent();
int currentEnd = end == -1 ? getLastColumn() : end; int currentEnd = end == -1 ? row.getLastColumn() : end;
for (int column = Math.max(getFirstColumn(), start); for (int column = Math.max(row.getFirstColumn(), start);
column <= Math.min(getLastColumn(), currentEnd); column <= Math.min(row.getLastColumn(), currentEnd);
column++) { column++) {
if (!isEmpty(column)) { if (!row.isEmpty(column)) {
return false; return false;
} }
@ -126,63 +63,144 @@ public class ExcelRow {
return true; return true;
} }
public int findEndRight(int start) { record ExcelRowFromPOIUserModel(Row row, int firstColumn, int lastColumn, boolean use1904Format)
Context context = Context.getCurrent(); implements ExcelRow {
int column = start; private static final DataFormatter formatter = new DataFormatter();
while (!isEmpty(column + 1)) {
column++;
context.safepoint();
}
return column;
}
/** Returns the formatted cell value. */ public int getFirstColumn() {
public String getFormattedCell(int col) { return firstColumn;
var cell = get(col);
if (cell == null) {
return "";
} }
var rawCellType = cell.getCellType(); public int getLastColumn() {
var cellType = return lastColumn;
rawCellType == CellType.FORMULA ? cell.getCachedFormulaResultType() : rawCellType;
return switch (cellType) {
case ERROR ->
// Want to show the error message rather than empty.
FormulaError.forInt(cell.getErrorCellValue()).getString();
case NUMERIC -> {
// Special handling for Number or Date cells as want to keep formatting.
var format = ExcelNumberFormat.from(cell, null);
var value = cell.getNumericCellValue();
yield format == null
? Double.toString(value)
: formatter.formatRawCellContents(value, format.getIdx(), format.getFormat());
}
default -> {
// Use the default read and then toString.
var value = getCellValue(col);
yield value == null ? "" : value.toString();
}
};
}
public String[] getCellsAsText(int startCol, int endCol) {
Context context = Context.getCurrent();
int currentEndCol = endCol == -1 ? getLastColumn() : endCol;
String[] output = new String[currentEndCol - startCol + 1];
for (int col = startCol; col <= currentEndCol; col++) {
Cell cell = get(col);
CellType type = ExcelRow.getCellType(cell);
if (type != CellType._NONE && type != CellType.BLANK && type != CellType.STRING) {
return null;
}
output[col - startCol] =
type == CellType.STRING && cell != null ? cell.getStringCellValue() : "";
context.safepoint();
} }
return output; public Cell get(int column) {
return (column < firstColumn || column > lastColumn) ? null : row.getCell(column - 1);
}
public Object getCellValue(int column) {
Cell cell = get(column);
CellType cellType = getCellType(cell);
switch (cellType) {
case NUMERIC:
double dblValue = cell.getNumericCellValue();
var nf = ExcelNumberFormat.from(cell, null);
if (nf != null && DateUtil.isADateFormat(nf.getIdx(), nf.getFormat())) {
var temporal =
use1904Format
? ExcelUtils.fromExcelDateTime1904(dblValue)
: ExcelUtils.fromExcelDateTime(dblValue);
if (temporal == null) {
return null;
}
return switch (temporal) {
case LocalDate date -> {
var dateFormat = cell.getCellStyle().getDataFormatString();
yield (dateFormat.contains("h") || dateFormat.contains("H"))
? date.atStartOfDay(ZoneId.systemDefault())
: date;
}
case ZonedDateTime zdt -> {
if (!use1904Format || zdt.getYear() != 1904 || zdt.getDayOfYear() != 1) {
yield temporal;
}
var dateFormat = cell.getCellStyle().getDataFormatString();
yield (dateFormat.contains("y")
|| dateFormat.contains("M")
|| dateFormat.contains("d"))
? zdt
: zdt.toLocalTime();
}
default -> temporal;
};
} else {
if (dblValue == (long) dblValue) {
return (long) dblValue;
} else {
return dblValue;
}
}
case STRING:
return cell.getStringCellValue();
case BOOLEAN:
return cell.getBooleanCellValue();
default:
return null;
}
}
public String getCellText(int column) {
var cell = get(column);
if (cell == null) {
return "";
}
var rawCellType = cell.getCellType();
var cellType =
rawCellType == CellType.FORMULA ? cell.getCachedFormulaResultType() : rawCellType;
return switch (cellType) {
case ERROR ->
// Want to show the error message rather than empty.
FormulaError.forInt(cell.getErrorCellValue()).getString();
case NUMERIC -> {
// Special handling for Number or Date cells as want to keep formatting.
var format = ExcelNumberFormat.from(cell, null);
var value = cell.getNumericCellValue();
yield format == null
? Double.toString(value)
: formatter.formatRawCellContents(value, format.getIdx(), format.getFormat());
}
default -> {
// Use the default read and then toString.
var value = getCellValue(column);
yield value == null ? "" : value.toString();
}
};
}
public boolean isEmpty(int column) {
CellType cellType = getCellType(get(column));
return (cellType == CellType._NONE) || (cellType == CellType.BLANK);
}
public boolean isEmpty(int start, int end) {
return isEmptyHelper(this, start, end);
}
public String[] getCellsAsText(int startCol, int endCol) {
Context context = Context.getCurrent();
int currentEndCol = endCol == -1 ? getLastColumn() : endCol;
String[] output = new String[currentEndCol - startCol + 1];
for (int col = startCol; col <= currentEndCol; col++) {
Cell cell = get(col);
CellType type = getCellType(cell);
if (type != CellType._NONE && type != CellType.BLANK && type != CellType.STRING) {
return null;
}
output[col - startCol] =
type == CellType.STRING && cell != null ? cell.getStringCellValue() : "";
context.safepoint();
}
return output;
}
private static CellType getCellType(Cell cell) {
if (cell == null) {
return CellType._NONE;
}
CellType cellType = cell.getCellType();
if (cellType == CellType.FORMULA) {
cellType = cell.getCachedFormulaResultType();
}
return cellType;
}
} }
} }

View File

@ -1,37 +1,83 @@
package org.enso.table.excel; package org.enso.table.excel;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.Workbook;
/** Wrapper class to handle Excel sheets. */ /** Wrapper class to handle Excel sheets. */
public class ExcelSheet { public interface ExcelSheet {
private final Sheet sheet; /** Gets the index of the sheet within the workbook (0-based). */
private final int firstRow; int getSheetIndex();
private final int lastRow;
private final boolean use1904Format;
public ExcelSheet(Workbook workbook, int sheetIndex) { /** Gets the name of the sheet. */
this.sheet = workbook.getSheetAt(sheetIndex); String getName();
this.firstRow = sheet.getFirstRowNum() + 1;
this.lastRow = sheet.getLastRowNum() + 1; /** Gets the initial row index within the sheet (1-based). */
this.use1904Format = ExcelUtils.is1904DateSystem(workbook); int getFirstRow();
/** Gets the final row index within the sheet (1-based). */
int getLastRow();
/**
* Gets the row at the given index within the sheet (1-based)
*
* @param row the row index (1-based).
* @return the row object or null if the row index is out of range or doesn't exist.
*/
ExcelRow get(int row);
/** Gets the underlying Apache POI Sheet object - may be null. Provided for Writer use only. */
Sheet getSheet();
/** Creates an ExcelSheet wrapping the sheet at the given index of an Apache POI Workbook. */
static ExcelSheet forPOIUserModel(Workbook workbook, int sheetIndex) {
var sheet = workbook.getSheetAt(sheetIndex);
return new ExcelSheetFromPOIUserModel(
sheet,
sheetIndex,
sheet.getSheetName(),
sheet.getFirstRowNum() + 1,
sheet.getLastRowNum() + 1,
ExcelUtils.is1904DateSystem(workbook));
} }
public int getLastRow() { record ExcelSheetFromPOIUserModel(
return lastRow; Sheet sheet,
} int sheetIndex,
String sheetName,
int firstRow,
int lastRow,
boolean use1904Format)
implements ExcelSheet {
@Override
public int getSheetIndex() {
return sheetIndex;
}
public int getFirstRow() { @Override
return firstRow; public String getName() {
} return sheetName;
}
public ExcelRow get(int row) { @Override
Row underlyingRow = row < firstRow || row > lastRow ? null : sheet.getRow(row - 1); public int getFirstRow() {
return underlyingRow == null ? null : new ExcelRow(underlyingRow, use1904Format); return firstRow;
} }
public Sheet getSheet() { @Override
return sheet; public int getLastRow() {
return lastRow;
}
@Override
public ExcelRow get(int row) {
return row < firstRow || row > lastRow
? null
: ExcelRow.forPOIUserModel(sheet, row, use1904Format);
}
@Override
public Sheet getSheet() {
return sheet;
}
} }
} }

View File

@ -1,6 +1,10 @@
package org.enso.table.excel; package org.enso.table.excel;
import java.time.*; import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.temporal.ChronoUnit; import java.time.temporal.ChronoUnit;
import java.time.temporal.Temporal; import java.time.temporal.Temporal;
import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.Workbook;

View File

@ -0,0 +1,123 @@
package org.enso.table.excel;
import java.io.IOException;
import org.apache.poi.ss.usermodel.Name;
/**
 * Read-only view of an Excel workbook.
 *
 * <p>Use {@link #forPOIUserModel(org.apache.poi.ss.usermodel.Workbook)} to wrap an Apache POI
 * Workbook object in this interface.
 */
public interface ExcelWorkbook {
  /**
   * Gets the number of sheets in the workbook.
   *
   * @return the number of sheets
   */
  int getNumberOfSheets();

  /**
   * Looks up the index of a sheet from its name.
   *
   * @param name the sheet name
   * @return index of the sheet (0 based)
   */
  int getSheetIndex(String name);

  /**
   * Gets the name of the sheet at the given position.
   *
   * @param sheet sheet number (0 based)
   * @return Sheet name
   */
  String getSheetName(int sheet);

  /**
   * Gets the number of defined names.
   *
   * @return the total number of defined names in this workbook
   */
  int getNumberOfNames();

  /**
   * Gets all the range names in the workbook.
   *
   * @return an array of range names
   */
  String[] getRangeNames();

  /**
   * Gets the formula a named range refers to.
   *
   * @param name the name of the range.
   * @return the formula for the range or null if not found.
   */
  String getNameFormula(String name);

  /**
   * Gets a sheet by its index.
   *
   * @param sheetIndex the index of the sheet (0 based)
   * @return the sheet as an ExcelSheet object
   * @throws IllegalArgumentException if the sheet index is out of range.
   */
  ExcelSheet getSheetAt(int sheetIndex);

  /**
   * Closes the underlying input resource (File or Stream), from which the Workbook was read.
   *
   * <p>Once this has been called, no further operations, updates or reads should be performed on
   * the Workbook.
   *
   * @throws IOException if closing the underlying resource fails.
   */
  void close() throws IOException;

  /**
   * Creates an ExcelWorkbook object from an Apache POI Workbook object.
   *
   * @param workbook the Apache POI Workbook object
   * @return the ExcelWorkbook object
   */
  static ExcelWorkbook forPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook) {
    return new ExcelWorkbookFromPOIUserModel(workbook);
  }

  /** Wraps an Apache POI Workbook object in the interface, delegating every call to it. */
  record ExcelWorkbookFromPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook)
      implements ExcelWorkbook {
    @Override
    public int getNumberOfSheets() {
      return workbook.getNumberOfSheets();
    }

    @Override
    public int getSheetIndex(String name) {
      return workbook.getSheetIndex(name);
    }

    @Override
    public String getSheetName(int sheet) {
      return workbook.getSheetName(sheet);
    }

    @Override
    public int getNumberOfNames() {
      return workbook.getNumberOfNames();
    }

    @Override
    public String[] getRangeNames() {
      var definedNames = workbook.getAllNames();
      var rangeNames = new String[definedNames.size()];
      int next = 0;
      for (Name definedName : definedNames) {
        rangeNames[next++] = definedName.getNameName();
      }
      return rangeNames;
    }

    @Override
    public String getNameFormula(String name) {
      var definedName = workbook.getName(name);
      if (definedName == null) {
        // No defined name with this identifier exists in the workbook.
        return null;
      }
      return definedName.getRefersToFormula();
    }

    @Override
    public ExcelSheet getSheetAt(int sheetIndex) {
      return ExcelSheet.forPOIUserModel(workbook, sheetIndex);
    }

    @Override
    public void close() throws IOException {
      workbook.close();
    }
  }
}

View File

@ -2,7 +2,6 @@ package org.enso.table.excel;
import java.io.IOException; import java.io.IOException;
import java.util.function.Function; import java.util.function.Function;
import org.apache.poi.ss.usermodel.Workbook;
public class ReadOnlyExcelConnection implements AutoCloseable { public class ReadOnlyExcelConnection implements AutoCloseable {
@ -28,7 +27,7 @@ public class ReadOnlyExcelConnection implements AutoCloseable {
record = null; record = null;
} }
public synchronized <T> T withWorkbook(Function<Workbook, T> f) throws IOException { public synchronized <T> T withWorkbook(Function<ExcelWorkbook, T> f) throws IOException {
if (record == null) { if (record == null) {
throw new IllegalStateException("ReadOnlyExcelConnection is being used after it was closed."); throw new IllegalStateException("ReadOnlyExcelConnection is being used after it was closed.");
} }

View File

@ -0,0 +1,29 @@
package org.enso.table.excel.xssfreader;
import java.util.HashMap;
import java.util.Map;
import org.apache.poi.xssf.model.StylesTable;
/**
 * Provides the format strings for number formats in an XSSF workbook, remembering each resolved
 * format so the styles table is consulted once per style index.
 */
public class XSSFReaderFormats {
  private final StylesTable stylesTable;
  private final Map<Short, String> numberFormats = new HashMap<>();

  public XSSFReaderFormats(StylesTable stylesTable) {
    this.stylesTable = stylesTable;
  }

  /**
   * Gets the number format string for the style at the given index.
   *
   * @param styleIdx index of the style in the styles table.
   * @return the data format string of the style, or an empty string when the style is missing, has
   *     no format string, or uses the "General" format.
   */
  public String getNumberFormatAt(short styleIdx) {
    // Cached entries are never null (see resolveFormat), so a null result means "not cached yet".
    String known = numberFormats.get(styleIdx);
    if (known != null) {
      return known;
    }

    String resolved = resolveFormat(styleIdx);
    numberFormats.put(styleIdx, resolved);
    return resolved;
  }

  /** Reads the format string from the styles table, mapping missing or "General" to "". */
  private String resolveFormat(short styleIdx) {
    var style = stylesTable.getStyleAt(styleIdx);
    if (style == null) {
      return "";
    }

    var formatString = style.getDataFormatString();
    boolean isGeneral = formatString == null || formatString.equals("General");
    return isGeneral ? "" : formatString;
  }
}

View File

@ -0,0 +1,125 @@
package org.enso.table.excel.xssfreader;
import java.time.LocalDateTime;
import java.util.SortedMap;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.enso.table.excel.ExcelRow;
/**
 * An {@link ExcelRow} backed by a sorted map of cell values keyed by column number, rather than
 * by an Apache POI Row object.
 */
public class XSSFReaderRow implements ExcelRow {
  private static final DataFormatter formatter = new DataFormatter();

  private final SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue> data;
  private final boolean use1904Dates;

  /**
   * @param data the cells of the row, keyed by column number.
   * @param use1904Dates passed on when converting and formatting date cells.
   */
  public XSSFReaderRow(
      SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue> data, boolean use1904Dates) {
    this.data = data;
    this.use1904Dates = use1904Dates;
  }

  /** Returns the smallest column number present in the row. */
  @Override
  public int getFirstColumn() {
    return data.firstKey();
  }

  /** Returns the largest column number present in the row. */
  @Override
  public int getLastColumn() {
    return data.lastKey();
  }

  /** Always throws, as this implementation holds no Apache POI Cell objects. */
  @Override
  public Cell get(int column) {
    // Not supported as we don't have the underlying Apache POI Cell object.
    throw new UnsupportedOperationException("XSSFReader does not support getting the Cell object.");
  }

  /** Returns the typed value of the cell in the given column, or null if absent/blank/error. */
  @Override
  public Object getCellValue(int column) {
    var cell = cellAt(column);
    if (cell == null) {
      return null;
    }

    return switch (cell.dataType()) {
      case BLANK, ERROR -> null;
      case BOOL -> cell.getBooleanValue();
      case DATE -> LocalDateTime.parse(cell.strValue()); // Don't believe used by Excel.
      case INLINE_STRING, SST_STRING, FORMULA_STRING -> cell.strValue();
      case INTEGER -> cell.getIntegerValue();
      case NUMBER -> {
        // Whole numbers are returned as long, everything else as double. Kept as if/else because
        // a conditional expression would promote the long branch to double.
        double number = cell.getNumberValue();
        long truncated = (long) number;
        if (number == truncated) {
          yield truncated;
        } else {
          yield number;
        }
      }
      case OLE_DATE -> cell.getDateValue(use1904Dates);
      case OLE_DATETIME -> cell.getDateTimeValue(use1904Dates);
    };
  }

  /** Returns the text of the cell in the given column, or "" if there is no such cell. */
  @Override
  public String getCellText(int column) {
    var cell = cellAt(column);
    if (cell == null) {
      return "";
    }

    return switch (cell.dataType()) {
      case BLANK -> "";
      case BOOL -> cell.getBooleanValue() ? "TRUE" : "FALSE";
      case NUMBER, OLE_DATETIME, OLE_DATE, INTEGER -> formatNumeric(cell);
      default -> cell.strValue();
    };
  }

  /** A cell is empty when it is absent or its string value is empty. */
  @Override
  public boolean isEmpty(int column) {
    var cell = cellAt(column);
    return cell == null || cell.strValue().isEmpty();
  }

  /**
   * Checks whether every cell in the given column range is empty.
   *
   * @param start first column to check.
   * @param end last column to check, or -1 for the last column of the row.
   */
  @Override
  public boolean isEmpty(int start, int end) {
    int requestedEnd = end == -1 ? getLastColumn() : end;
    // Only columns that can hold data need checking, so clamp to the populated range.
    int column = Math.max(getFirstColumn(), start);
    int limit = Math.min(getLastColumn(), requestedEnd);
    while (column <= limit) {
      if (!isEmpty(column)) {
        return false;
      }
      column++;
    }
    return true;
  }

  /**
   * Gets the string values of a range of cells.
   *
   * @param startCol first column to read.
   * @param endCol last column to read, or -1 for the last column of the row.
   * @return the cell strings ("" for absent cells), or null if any present cell is not a string.
   */
  @Override
  public String[] getCellsAsText(int startCol, int endCol) {
    int lastCol = endCol == -1 ? getLastColumn() : endCol;
    var texts = new String[lastCol - startCol + 1];
    int col = startCol;
    while (col <= lastCol) {
      var cell = cellAt(col);
      if (cell == null) {
        texts[col - startCol] = "";
      } else if (cell.dataType().isString()) {
        texts[col - startCol] = cell.strValue();
      } else {
        // Short circuit if find not a string cell.
        return null;
      }
      col++;
    }
    return texts;
  }

  /** Looks up the cell stored for the given column, or null if there is none. */
  private XSSFReaderSheetXMLHandler.CellValue cellAt(int column) {
    return data.get((short) column);
  }

  /** Formats a numeric or date cell with its format, falling back to the raw string value. */
  private String formatNumeric(XSSFReaderSheetXMLHandler.CellValue cell) {
    // Special handling for Number or Date cells as want to keep formatting.
    var formatText = cell.format();
    if (formatText == null || formatText.isEmpty()) {
      return cell.strValue();
    }
    return formatter.formatRawCellContents(cell.getNumberValue(), -1, formatText, use1904Dates);
  }
}

View File

@ -0,0 +1,150 @@
package org.enso.table.excel.xssfreader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.util.XMLHelper;
import org.enso.table.excel.ExcelRow;
import org.enso.table.excel.ExcelSheet;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class XSSFReaderSheet implements ExcelSheet {
private final int sheetIdx;
private final String sheetName;
private final String relId;
private final XSSFReaderWorkbook parent;
private boolean hasReadSheetData = false;
private String dimensions;
private int firstRow;
private int lastRow;
private Map<Integer, SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue>> rowData;
public XSSFReaderSheet(int sheetIdx, String sheetName, String relId, XSSFReaderWorkbook parent) {
this.sheetIdx = sheetIdx;
this.sheetName = sheetName;
this.relId = relId;
this.parent = parent;
}
private synchronized void ensureReadSheetData() {
if (hasReadSheetData) {
return;
}
try {
var strings = parent.getSharedStrings();
var styles = parent.getStyles();
var handler =
new XSSFReaderSheetXMLHandler(styles, strings) {
@Override
protected void onDimensions(String dimension) {
handleOnDimensions(dimension);
}
@Override
protected void onStartRow(int rowNum) {
handleOnStartRow(rowNum);
}
@Override
protected void onCell(int rowNumber, short columnNumber, String ref, CellValue value) {
handleOnCell(rowNumber, columnNumber, value);
}
};
var xmlReader = XMLHelper.newXMLReader();
xmlReader.setContentHandler(handler);
rowData = new HashMap<>();
try {
parent.withReader(
reader -> {
try {
var sheet = reader.getSheet(relId);
xmlReader.parse(new InputSource(sheet));
} catch (SAXException | InvalidFormatException | IOException e) {
throw new RuntimeException(e);
}
});
} catch (IOException e) {
throw new RuntimeException(e);
}
hasReadSheetData = true;
} catch (SAXException | ParserConfigurationException e) {
throw new RuntimeException(e);
}
}
@Override
public int getSheetIndex() {
return sheetIdx;
}
@Override
public String getName() {
return sheetName;
}
public String getDimensions() {
ensureReadSheetData();
return dimensions;
}
@Override
public int getFirstRow() {
ensureReadSheetData();
return firstRow;
}
@Override
public int getLastRow() {
ensureReadSheetData();
return lastRow;
}
@Override
public ExcelRow get(int row) {
ensureReadSheetData();
if (!rowData.containsKey(row)) {
return null;
}
return new XSSFReaderRow(rowData.get(row), parent.use1904Format());
}
@Override
public Sheet getSheet() {
// Not supported as we don't have the underlying Apache POI Sheet object.
throw new UnsupportedOperationException(
"XSSFReader does not support getting the Sheet object.");
}
/** Stores the dimension reference reported by the sheet XML handler. */
protected void handleOnDimensions(String dimension) {
  this.dimensions = dimension;
}
/**
 * Widens the tracked first/last row range to include {@code rowNum}. A stored value of 0 is
 * treated as "not set yet" and is replaced unconditionally.
 */
private void handleOnStartRow(int rowNum) {
  firstRow = (firstRow == 0) ? rowNum : Math.min(firstRow, rowNum);
  lastRow = (lastRow == 0) ? rowNum : Math.max(lastRow, rowNum);
}
/**
 * Records a cell value under its row and column, creating the (column-sorted) row map the first
 * time a row is seen.
 */
private void handleOnCell(
    int rowNumber, short columnNumber, XSSFReaderSheetXMLHandler.CellValue value) {
  var rowCells = rowData.get(rowNumber);
  if (rowCells == null) {
    rowCells = new TreeMap<>();
    rowData.put(rowNumber, rowCells);
  }
  rowCells.put(columnNumber, value);
}
}

View File

@ -0,0 +1,259 @@
package org.enso.table.excel.xssfreader;
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
import java.time.ZonedDateTime;
import java.time.temporal.Temporal;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.enso.table.excel.ExcelUtils;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX-based handler to read Excel sheet XML on top of POI support. Based on the
 * XSSFSheetXMLHandler class from Apache POI.
 *
 * <p>Technical specification can be found at:
 * https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oe376/db9b9b72-b10b-4e7e-844c-09f88c972219
 * https://ecma-international.org/publications-and-standards/standards/ecma-376/
 *
 * <p>Parsed content is delivered through the protected {@code on...} callbacks, which do nothing
 * by default and are meant to be overridden.
 */
public class XSSFReaderSheetXMLHandler extends DefaultHandler {
  private final XSSFReaderFormats styles;
  private final SharedStrings sharedStrings;

  /** The kind of value held by a cell, as derived from the cell's {@code t} attribute. */
  public enum XSSDataType {
    BLANK,
    BOOL,
    DATE,
    ERROR,
    INLINE_STRING,
    SST_STRING,
    NUMBER,
    /** A NUMBER whose text has no fractional part or exponent. */
    INTEGER,
    /** An INTEGER whose number format is a date format. */
    OLE_DATE,
    /** A non-integer NUMBER whose number format is a date format. */
    OLE_DATETIME,
    FORMULA_STRING;

    /** Returns true for the three string-like types. */
    public boolean isString() {
      return this == INLINE_STRING || this == SST_STRING || this == FORMULA_STRING;
    }
  }

  // Record if seen a value element
  private boolean seenValue;

  // Set when V start element is seen
  private boolean vIsOpen;

  // Set when an Inline String "is" is seen
  private boolean isIsOpen;

  // The current row being read (or -1 if not in a row)
  private int rowNumber = -1;

  // Handle missing rowNumber in the XML (happens in Excel), first row would be row 1.
  private int nextRowNumber = 1;

  // Handle a missing cell reference in the XML: the column the next cell of this row falls in.
  private short nextColumnNumber = 1;

  // The current cell being read (or null if not in a cell, or if the cell has no reference)
  private String cellRef;

  // Set when cell start element is seen, used when cell close element is seen.
  private XSSDataType dataType;

  // Gathers characters as they are seen.
  private final StringBuilder value = new StringBuilder(64);

  private String numberFormat = null;

  /**
   * Creates a handler.
   *
   * @param styles source of the number format for a cell style index.
   * @param strings shared strings table used to resolve cells of type {@code s}.
   */
  public XSSFReaderSheetXMLHandler(XSSFReaderFormats styles, SharedStrings strings) {
    this.styles = styles;
    this.sharedStrings = strings;
  }

  /** Returns true for elements whose character content should be captured. */
  private boolean isTextTag(String name) {
    return "v".equals(name) || "inlineStr".equals(name) || ("t".equals(name) && isIsOpen);
  }

  @Override
  public void startElement(String uri, String localName, String qName, Attributes attributes) {
    if (uri != null && !NS_SPREADSHEETML.equals(uri)) {
      return;
    }

    if (isTextTag(localName)) {
      seenValue = true;
      vIsOpen = true;
      // Inside an inline string several text runs are concatenated, so only clear outside one.
      if (!isIsOpen) {
        value.setLength(0);
      }
      return;
    }

    switch (localName) {
      case "dimension" -> { // Dimensions of sheet
        var dimension = attributes.getValue("ref");
        if (dimension != null) {
          onDimensions(dimension);
        }
      }
      case "row" -> { // Row
        String rowNumStr = attributes.getValue("r");
        rowNumber = rowNumStr == null ? nextRowNumber : Integer.parseInt(rowNumStr);
        nextColumnNumber = 1;
        onStartRow(rowNumber);
      }
      case "c" -> startCell(attributes); // Cell
      case "is" -> isIsOpen = true; // Inline String
      default -> {}
    }
  }

  /** Resets the per-cell state and reads the cell's reference, type and number format. */
  private void startCell(Attributes attributes) {
    cellRef = attributes.getValue("r");
    seenValue = false;
    // Clear text left over from the previous cell. Without this, inline strings (whose text
    // tags do not clear the buffer) would be appended to the previous cell's content.
    value.setLength(0);

    String cellType = attributes.getValue("t");
    if (cellType == null) {
      cellType = "n"; // Number is default
    }

    dataType =
        switch (cellType) {
          case "b" -> XSSDataType.BOOL;
          case "e" -> XSSDataType.ERROR;
          case "d" -> XSSDataType.DATE; // Date in ISO 8601 format.
          case "inlineStr" -> XSSDataType.INLINE_STRING;
          case "s" -> XSSDataType.SST_STRING;
          case "str" -> XSSDataType.FORMULA_STRING; // String formula
          default -> XSSDataType.NUMBER;
        };

    // Read the format for NUMBER
    numberFormat = null;
    if (dataType == XSSDataType.NUMBER) {
      String cellStyleStr = attributes.getValue("s");
      if (cellStyleStr != null) {
        short styleIndex = (short) Integer.parseInt(cellStyleStr);
        numberFormat = styles.getNumberFormatAt(styleIndex);
      }
    }
  }

  /** Captures characters if a suitable element is open. */
  @Override
  public void characters(char[] ch, int start, int length) {
    if (vIsOpen) {
      value.append(ch, start, length);
    }
  }

  @Override
  public void endElement(String uri, String localName, String qName) {
    if (uri != null && !NS_SPREADSHEETML.equals(uri)) {
      return;
    }

    if (isTextTag(localName)) {
      vIsOpen = false;
      return;
    }

    switch (localName) {
      case "sheetData" -> onSheetEnd();
      case "row" -> {
        nextRowNumber = rowNumber + 1;
        rowNumber = -1;
      }
      case "c" -> outputCellValue();
      case "is" -> isIsOpen = false;
      default -> {}
    }
  }

  /**
   * A single parsed cell.
   *
   * @param dataType the resolved type of the cell.
   * @param strValue the cell text (already resolved for shared and inline strings).
   * @param format the number format of the cell, or {@code null} if none was read.
   */
  public record CellValue(XSSDataType dataType, String strValue, String format) {
    /** Returns true when the text starts with {@code '1'}; empty or null text is false. */
    public boolean getBooleanValue() {
      return strValue != null && !strValue.isEmpty() && strValue.charAt(0) == '1';
    }

    /**
     * Parses the text as a double.
     *
     * @throws NumberFormatException if the text is not a valid double.
     */
    public double getNumberValue() {
      return Double.parseDouble(strValue);
    }

    /**
     * Parses the text as a long.
     *
     * @throws NumberFormatException if the text is not a valid long.
     */
    public long getIntegerValue() {
      return Long.parseLong(strValue);
    }

    /** Converts the integer value to a date using the 1900 or 1904 date system. */
    public Temporal getDateValue(boolean use1904Dates) {
      return use1904Dates
          ? ExcelUtils.fromExcelDateTime1904(getIntegerValue())
          : ExcelUtils.fromExcelDateTime(getIntegerValue());
    }

    /**
     * Converts the numeric value to a date-time using the 1900 or 1904 date system.
     *
     * <p>In the 1904 system, a result falling on the first day of 1904 whose format contains no
     * {@code y}, {@code M} or {@code d} is reduced to its local time.
     */
    public Temporal getDateTimeValue(boolean use1904Dates) {
      if (!use1904Dates) {
        return ExcelUtils.fromExcelDateTime(getNumberValue());
      }

      var datetime = ExcelUtils.fromExcelDateTime1904(getNumberValue());
      if (datetime instanceof ZonedDateTime zdt
          && zdt.getYear() == 1904
          && zdt.getDayOfYear() == 1
          && format != null // without a format we cannot tell it is time-only
          && !format.contains("y")
          && !format.contains("M")
          && !format.contains("d")) {
        datetime = zdt.toLocalTime();
      }
      return datetime;
    }
  }

  /**
   * Returns the text of the current cell, resolving shared strings by index and passing inline
   * strings through {@link XSSFRichTextString}.
   */
  public String getStringValue() {
    if (dataType == XSSDataType.SST_STRING) {
      return getSharedString(value.toString());
    } else if (dataType == XSSDataType.INLINE_STRING) {
      return new XSSFRichTextString(value.toString()).toString();
    }
    return value.toString();
  }

  /** Looks up a shared string by its (textual) index; null if the table has no such item. */
  private String getSharedString(String value) {
    int idx = Integer.parseInt(value);
    var ss = sharedStrings.getItemAt(idx);
    return ss == null ? null : ss.toString();
  }

  /**
   * Returns the 1-based column of the current cell: parsed from the leading upper-case letters of
   * the cell reference, or the column following the previous cell when there is no reference.
   */
  private short resolveColumnNumber() {
    if (cellRef == null) {
      return nextColumnNumber;
    }

    short columnNumber = 0;
    int i = 0;
    char c;
    while (i < cellRef.length() && (c = cellRef.charAt(i)) >= 'A' && c <= 'Z') {
      columnNumber = (short) (columnNumber * 26 + (c - 'A' + 1));
      i++;
    }
    return columnNumber;
  }

  /** Builds an A1-style reference from a 1-based column number and a row number. */
  private static String toCellRef(int rowNumber, int columnNumber) {
    var letters = new StringBuilder();
    for (int c = columnNumber; c > 0; c = (c - 1) / 26) {
      letters.insert(0, (char) ('A' + (c - 1) % 26));
    }
    return letters.append(rowNumber).toString();
  }

  /** Returns true when the numeric text has neither a decimal point nor an exponent. */
  private static boolean isIntegerText(String number) {
    // Values such as "1E-3" have no '.', but are not parseable as a long.
    return number.indexOf('.') < 0 && number.indexOf('E') < 0 && number.indexOf('e') < 0;
  }

  /** Emits the current cell through {@link #onCell}, refining NUMBER into its sub-types. */
  private void outputCellValue() {
    short columnNumber = resolveColumnNumber();
    nextColumnNumber = (short) (columnNumber + 1);
    String ref = cellRef != null ? cellRef : toCellRef(rowNumber, columnNumber);

    if (!seenValue) {
      onCell(rowNumber, columnNumber, ref, new CellValue(XSSDataType.BLANK, "", null));
      return;
    }

    var stringValue = getStringValue();
    if (dataType == XSSDataType.NUMBER) {
      boolean isInteger = isIntegerText(stringValue);
      boolean isDate = DateUtil.isADateFormat(-1, numberFormat);
      if (isInteger && isDate) {
        dataType = XSSDataType.OLE_DATE;
      } else if (isInteger) {
        dataType = XSSDataType.INTEGER;
      } else if (isDate) {
        dataType = XSSDataType.OLE_DATETIME;
      }
    }

    var cellValue = new CellValue(dataType, stringValue, numberFormat);
    onCell(rowNumber, columnNumber, ref, cellValue);
  }

  /** Called with the {@code ref} attribute of the sheet's dimension element, when present. */
  protected void onDimensions(String dimension) {}

  /** Called at the start of every row element with its row number. */
  protected void onStartRow(int rowNumber) {}

  /** Called at the end of every cell element with its position, reference and value. */
  protected void onCell(int rowNumber, short columnNumber, String ref, CellValue cellValue) {}

  /** Called when the sheetData element ends. */
  protected void onSheetEnd() {}
}

View File

@ -0,0 +1,284 @@
package org.enso.table.excel.xssfreader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.poi.ooxml.util.DocumentHelper;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.RichTextString;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.enso.table.excel.ExcelSheet;
import org.enso.table.excel.ExcelWorkbook;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * An {@link ExcelWorkbook} for XLSX files built on POI's {@link XSSFReader}.
 *
 * <p>The workbook part (date system, sheet list, defined names) is read in the constructor. Shared
 * strings and styles are read on first request. The package is opened read-only for each operation
 * and closed again afterwards, so {@link #close()} has nothing to release.
 */
public class XSSFReaderWorkbook implements ExcelWorkbook {
  private static final XPathFactory xpathFactory = XPathFactory.newInstance();
  private static final NamespaceContext namespaceContext = new SpreadsheetContext();

  // NOTE(review): this cache is a plain HashMap shared by all instances and is not synchronized;
  // confirm workbooks are never constructed concurrently.
  private static final Map<String, XPathExpression> xpathCache = new HashMap<>();

  /** Compiles an XPath using the {@code ss} prefix for SpreadsheetML, caching the result. */
  private static XPathExpression compileXPathWithNamespace(String xpath)
      throws XPathExpressionException {
    var compiled = xpathCache.get(xpath);
    if (compiled == null) {
      var newXPath = xpathFactory.newXPath();
      newXPath.setNamespaceContext(namespaceContext);
      compiled = newXPath.compile(xpath);
      xpathCache.put(xpath, compiled);
    }
    return compiled;
  }

  /** Namespace context binding the single prefix {@code ss} to the SpreadsheetML namespace. */
  private static class SpreadsheetContext implements NamespaceContext {
    @Override
    public String getNamespaceURI(String prefix) {
      if (prefix == null) {
        throw new IllegalArgumentException("prefix cannot be null");
      }
      return prefix.equals("ss") ? XSSFRelation.NS_SPREADSHEETML : XMLConstants.NULL_NS_URI;
    }

    @Override
    public String getPrefix(String namespaceURI) {
      if (namespaceURI == null) {
        throw new IllegalArgumentException("namespaceURI cannot be null");
      }
      return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML) ? "ss" : null;
    }

    @Override
    public Iterator<String> getPrefixes(String namespaceURI) {
      if (namespaceURI == null) {
        throw new IllegalArgumentException("namespaceURI cannot be null");
      }
      return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML)
          ? Collections.singleton("ss").iterator()
          : Arrays.stream(new String[0]).iterator();
    }
  }

  public static final String WORKBOOK_CONFIG_XPATH = "/ss:workbook/ss:workbookPr";
  public static final String SHEET_NAME_XPATH = "/ss:workbook/ss:sheets/ss:sheet";
  public static final String NAMED_RANGE_XPATH = "/ss:workbook/ss:definedNames/ss:definedName";

  private final String path;

  private boolean use1904DateSystemFlag = false;
  private List<SheetInfo> sheetInfos;
  private Map<String, SheetInfo> sheetInfoMap;
  private Map<String, NamedRange> namedRangeMap;

  private boolean hasReadShared = false;
  private SharedStrings sharedStrings;
  private XSSFReaderFormats styles;

  /**
   * Opens the file at {@code path} and reads the workbook-level data.
   *
   * @param path path of the XLSX file.
   * @throws IOException if the file cannot be opened as an XLSX package.
   * @throws RuntimeException wrapping failures while reading or querying the workbook XML.
   */
  public XSSFReaderWorkbook(String path) throws IOException {
    this.path = path;

    // Read the workbook data
    this.readWorkbookData();
  }

  /** Returns the path this workbook was created with. */
  public String getPath() {
    return path;
  }

  /**
   * Opens the package read-only, runs {@code action} with a reader over it, then closes the
   * package.
   *
   * @throws IOException if opening fails, including package-format errors (wrapped).
   */
  void withReader(Consumer<XSSFReader> action) throws IOException {
    try (var pkg = OPCPackage.open(path, PackageAccess.READ)) {
      var reader = new XSSFReader(pkg);
      action.accept(reader);
    } catch (OpenXML4JException e) {
      throw new IOException(
          "Invalid format encountered when opening the file " + path + " as XLSX.", e);
    }
  }

  private record SheetInfo(int index, int sheetId, String name, String relID, boolean visible) {}

  private record NamedRange(String name, String formula) {}

  /** Parses the workbook part and extracts the date system, the sheets and the defined names. */
  private void readWorkbookData() throws IOException {
    withReader(
        reader -> {
          // try-with-resources: release the workbook stream once the document is built.
          try (var workbookData = reader.getWorkbookData()) {
            var workbookDoc = DocumentHelper.readDocument(workbookData);
            read1904DateSetting(workbookDoc);
            readSheetInfo(workbookDoc);
            readNamedRanges(workbookDoc);
          } catch (SAXException
              | IOException
              | InvalidFormatException
              | XPathExpressionException e) {
            throw new RuntimeException(e);
          }
        });
  }

  /** Collects the defined names and their formula text, keeping document order. */
  private void readNamedRanges(Document workbookDoc) throws XPathExpressionException {
    var namesXPath = compileXPathWithNamespace(NAMED_RANGE_XPATH);
    var nameNodes = (NodeList) namesXPath.evaluate(workbookDoc, XPathConstants.NODESET);

    // Insertion-ordered so getRangeNames() is deterministic and follows the workbook.
    namedRangeMap = new LinkedHashMap<>();
    for (int i = 0; i < nameNodes.getLength(); i++) {
      var node = nameNodes.item(i);
      var name = node.getAttributes().getNamedItem("name").getNodeValue();
      var formula = node.getTextContent();
      namedRangeMap.put(name, new NamedRange(name, formula));
    }
  }

  /** Collects the sheets in document order; the position in that order is the sheet index. */
  private void readSheetInfo(Document workbookDoc) throws XPathExpressionException {
    var sheetXPath = compileXPathWithNamespace(SHEET_NAME_XPATH);
    var sheetNodes = (NodeList) sheetXPath.evaluate(workbookDoc, XPathConstants.NODESET);

    sheetInfos = new ArrayList<>(sheetNodes.getLength());
    sheetInfoMap = new HashMap<>();
    for (int i = 0; i < sheetNodes.getLength(); i++) {
      var attributes = sheetNodes.item(i).getAttributes();
      var sheetName = attributes.getNamedItem("name").getNodeValue();
      var sheetId = Integer.parseInt(attributes.getNamedItem("sheetId").getNodeValue());
      var relId = attributes.getNamedItem("r:id").getNodeValue();

      // A missing state means visible, and so does an explicit state="visible".
      var state = attributes.getNamedItem("state");
      var visible = state == null || "visible".equals(state.getNodeValue());

      var sheetInfo = new SheetInfo(i, sheetId, sheetName, relId, visible);
      sheetInfos.add(sheetInfo);
      sheetInfoMap.put(sheetName, sheetInfo);
    }
  }

  /** Reads the {@code date1904} attribute of the workbook properties, if present. */
  private void read1904DateSetting(Document workbookDoc) throws XPathExpressionException {
    var workbookXPath = compileXPathWithNamespace(WORKBOOK_CONFIG_XPATH);
    var workbookNode = (Node) workbookXPath.evaluate(workbookDoc, XPathConstants.NODE);
    if (workbookNode != null) {
      var date1904 = workbookNode.getAttributes().getNamedItem("date1904");
      use1904DateSystemFlag = date1904 != null && isXmlTrue(date1904.getNodeValue());
    }
  }

  /** Returns true for the two true literals of an XML Schema boolean: "1" and "true". */
  private static boolean isXmlTrue(String value) {
    if (value == null) {
      return false;
    }
    var trimmed = value.trim();
    return "1".equals(trimmed) || "true".equals(trimmed);
  }

  /**
   * Reads the shared strings and styles on first use. The flag is only set once both have been
   * read, so a failed attempt is retried on the next call.
   */
  private synchronized void ensureReadShared() {
    if (hasReadShared) {
      return;
    }

    try {
      withReader(
          reader -> {
            try {
              reader.setUseReadOnlySharedStringsTable(true);
              sharedStrings = reader.getSharedStringsTable();
              if (sharedStrings == null) {
                // No shared strings in the workbook: substitute an empty table.
                sharedStrings =
                    new SharedStrings() {
                      @Override
                      public RichTextString getItemAt(int idx) {
                        return null;
                      }

                      @Override
                      public int getCount() {
                        return 0;
                      }

                      @Override
                      public int getUniqueCount() {
                        return 0;
                      }
                    };
              }

              // Read the styles table and attach the format data
              var stylesTable = reader.getStylesTable();
              styles = new XSSFReaderFormats(stylesTable);

              hasReadShared = true;
            } catch (InvalidFormatException | IOException e) {
              throw new RuntimeException(e);
            }
          });
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Flag that workbook is in 1904 format. */
  boolean use1904Format() {
    return use1904DateSystemFlag;
  }

  @Override
  public int getNumberOfSheets() {
    // Use the list so the count agrees with the bounds checked by getSheetName/getSheetAt.
    return sheetInfos.size();
  }

  /** Returns the index of the named sheet, or -1 if there is no sheet with that name. */
  @Override
  public int getSheetIndex(String name) {
    var sheetInfo = sheetInfoMap.get(name);
    return sheetInfo == null ? -1 : sheetInfo.index;
  }

  /**
   * Returns the name of the sheet at the given index.
   *
   * @throws IllegalArgumentException if the index is out of range.
   */
  @Override
  public String getSheetName(int sheet) {
    if (sheet < 0 || sheet >= sheetInfos.size()) {
      throw new IllegalArgumentException("Sheet index out of range: " + sheet);
    }
    return sheetInfos.get(sheet).name;
  }

  @Override
  public int getNumberOfNames() {
    return namedRangeMap.size();
  }

  /** Returns the defined names in the order they appear in the workbook. */
  @Override
  public String[] getRangeNames() {
    return namedRangeMap.keySet().toArray(String[]::new);
  }

  /** Returns the formula text of the defined name, or null if the name is unknown. */
  @Override
  public String getNameFormula(String name) {
    var namedRange = namedRangeMap.get(name);
    return namedRange == null ? null : namedRange.formula;
  }

  /** Returns the shared strings table, reading it first if necessary. */
  public SharedStrings getSharedStrings() {
    ensureReadShared();
    return sharedStrings;
  }

  /** Returns the style formats, reading them first if necessary. */
  public XSSFReaderFormats getStyles() {
    ensureReadShared();
    return styles;
  }

  /**
   * Creates a new sheet object for the given index; the sheet content is not read here.
   *
   * @throws IllegalArgumentException if the index is out of range.
   */
  @Override
  public ExcelSheet getSheetAt(int sheetIndex) {
    if (sheetIndex < 0 || sheetIndex >= sheetInfos.size()) {
      throw new IllegalArgumentException("Sheet index out of range: " + sheetIndex);
    }
    var sheetInfo = sheetInfos.get(sheetIndex);
    return new XSSFReaderSheet(sheetIndex, sheetInfo.name, sheetInfo.relID, this);
  }

  @Override
  public void close() throws IOException {
    // Nothing to do
  }
}

View File

@ -7,9 +7,6 @@ import java.util.List;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Name;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.CellReference; import org.apache.poi.ss.util.CellReference;
import org.enso.table.data.column.builder.Builder; import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.builder.InferredBuilder; import org.enso.table.data.column.builder.InferredBuilder;
@ -24,6 +21,7 @@ import org.enso.table.excel.ExcelHeaders;
import org.enso.table.excel.ExcelRange; import org.enso.table.excel.ExcelRange;
import org.enso.table.excel.ExcelRow; import org.enso.table.excel.ExcelRow;
import org.enso.table.excel.ExcelSheet; import org.enso.table.excel.ExcelSheet;
import org.enso.table.excel.ExcelWorkbook;
import org.enso.table.excel.ReadOnlyExcelConnection; import org.enso.table.excel.ReadOnlyExcelConnection;
import org.enso.table.problems.ProblemAggregator; import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context; import org.graalvm.polyglot.Context;
@ -38,18 +36,17 @@ public class ExcelReader {
* @return a String[] containing the sheet names. * @return a String[] containing the sheet names.
* @throws IOException when the action fails * @throws IOException when the action fails
*/ */
public static String[] readSheetNames(File file, ExcelFileFormat format) public static String[] readSheetNames(File file, ExcelFileFormat format) throws IOException {
throws IOException, InvalidFormatException {
return withWorkbook(file, format, ExcelReader::readSheetNames); return withWorkbook(file, format, ExcelReader::readSheetNames);
} }
/** /**
* Reads a list of sheet names from a workbook into an array. * Reads a list of sheet names from a workbook into an array.
* *
* @param workbook a {@link Workbook} to read the sheet names from. * @param workbook a {@link ExcelWorkbook} to read the sheet names from.
* @return a String[] containing the sheet names. * @return a String[] containing the sheet names.
*/ */
public static String[] readSheetNames(Workbook workbook) { public static String[] readSheetNames(ExcelWorkbook workbook) {
int sheetCount = workbook.getNumberOfSheets(); int sheetCount = workbook.getNumberOfSheets();
var output = new String[sheetCount]; var output = new String[sheetCount];
Context context = Context.getCurrent(); Context context = Context.getCurrent();
@ -68,20 +65,8 @@ public class ExcelReader {
* @return a String[] containing the range names. * @return a String[] containing the range names.
* @throws IOException when the action fails * @throws IOException when the action fails
*/ */
public static String[] readRangeNames(File file, ExcelFileFormat format) public static String[] readRangeNames(File file, ExcelFileFormat format) throws IOException {
throws IOException, InvalidFormatException { return withWorkbook(file, format, ExcelWorkbook::getRangeNames);
return withWorkbook(file, format, ExcelReader::readRangeNames);
}
/**
* Reads a list of range names for the specified XLSX/XLS file into an array.
*
* @param workbook a {@link Workbook} to read the sheet names from.
* @return a String[] containing the range names.
*/
public static String[] readRangeNames(Workbook workbook) {
var names = workbook.getAllNames();
return names.stream().map(Name::getNameName).toArray(String[]::new);
} }
/** /**
@ -202,7 +187,7 @@ public class ExcelReader {
/** /**
* Reads a range by sheet name, named range or address for the workbook into a table. * Reads a range by sheet name, named range or address for the workbook into a table.
* *
* @param workbook a {@link Workbook} to read from. * @param workbook a {@link ExcelWorkbook} to read from.
* @param rangeNameOrAddress sheet name, range name or address to read. * @param rangeNameOrAddress sheet name, range name or address to read.
* @param headers specifies whether the first row should be used as headers. * @param headers specifies whether the first row should be used as headers.
* @param skip_rows skip rows from the top of the range. * @param skip_rows skip rows from the top of the range.
@ -211,7 +196,7 @@ public class ExcelReader {
* @throws InvalidLocationException when the range name or address is not found. * @throws InvalidLocationException when the range name or address is not found.
*/ */
public static Table readRangeByName( public static Table readRangeByName(
Workbook workbook, ExcelWorkbook workbook,
String rangeNameOrAddress, String rangeNameOrAddress,
ExcelHeaders.HeaderBehavior headers, ExcelHeaders.HeaderBehavior headers,
int skip_rows, int skip_rows,
@ -230,11 +215,10 @@ public class ExcelReader {
problemAggregator); problemAggregator);
} }
Name name = workbook.getName(rangeNameOrAddress);
ExcelRange excelRange; ExcelRange excelRange;
try { try {
excelRange = new ExcelRange(name == null ? rangeNameOrAddress : name.getRefersToFormula()); var formula = workbook.getNameFormula(rangeNameOrAddress);
excelRange = new ExcelRange(formula == null ? rangeNameOrAddress : formula);
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
throw new InvalidLocationException( throw new InvalidLocationException(
rangeNameOrAddress, rangeNameOrAddress,
@ -271,8 +255,8 @@ public class ExcelReader {
readRange(workbook, excelRange, headers, skip_rows, row_limit, problemAggregator)); readRange(workbook, excelRange, headers, skip_rows, row_limit, problemAggregator));
} }
private static <T> T withWorkbook(File file, ExcelFileFormat format, Function<Workbook, T> action) private static <T> T withWorkbook(
throws IOException { File file, ExcelFileFormat format, Function<ExcelWorkbook, T> action) throws IOException {
try (ReadOnlyExcelConnection connection = try (ReadOnlyExcelConnection connection =
ExcelConnectionPool.INSTANCE.openReadOnlyConnection(file, format)) { ExcelConnectionPool.INSTANCE.openReadOnlyConnection(file, format)) {
return connection.withWorkbook(action); return connection.withWorkbook(action);
@ -280,7 +264,7 @@ public class ExcelReader {
} }
public static Table readRange( public static Table readRange(
Workbook workbook, ExcelWorkbook workbook,
ExcelRange excelRange, ExcelRange excelRange,
ExcelHeaders.HeaderBehavior headers, ExcelHeaders.HeaderBehavior headers,
int skip_rows, int skip_rows,
@ -304,7 +288,7 @@ public class ExcelReader {
} }
private static Table readTable( private static Table readTable(
Workbook workbook, ExcelWorkbook workbook,
int sheetIndex, int sheetIndex,
ExcelRange excelRange, ExcelRange excelRange,
ExcelHeaders.HeaderBehavior headers, ExcelHeaders.HeaderBehavior headers,
@ -312,7 +296,7 @@ public class ExcelReader {
int rowCount, int rowCount,
ProblemAggregator problemAggregator) { ProblemAggregator problemAggregator) {
ExcelSheet sheet = new ExcelSheet(workbook, sheetIndex); ExcelSheet sheet = workbook.getSheetAt(sheetIndex);
// Expand Single Cell // Expand Single Cell
if (excelRange != null && excelRange.isSingleCell()) { if (excelRange != null && excelRange.isSingleCell()) {

View File

@ -77,7 +77,8 @@ public class ExcelWriter {
headers = headers =
headers != ExcelHeaders.HeaderBehavior.INFER headers != ExcelHeaders.HeaderBehavior.INFER
? headers ? headers
: shouldWriteHeaders(new ExcelSheet(workbook, sheetIndex), firstRow + 1, 1, -1); : shouldWriteHeaders(
ExcelSheet.forPOIUserModel(workbook, sheetIndex), firstRow + 1, 1, -1);
String sheetName = workbook.getSheetName(sheetIndex - 1); String sheetName = workbook.getSheetName(sheetIndex - 1);
workbook.removeSheetAt(sheetIndex - 1); workbook.removeSheetAt(sheetIndex - 1);
@ -130,7 +131,8 @@ public class ExcelWriter {
headers = headers =
headers != ExcelHeaders.HeaderBehavior.INFER headers != ExcelHeaders.HeaderBehavior.INFER
? headers ? headers
: shouldWriteHeaders(new ExcelSheet(workbook, sheetIndex), firstRow + 1, 1, -1); : shouldWriteHeaders(
ExcelSheet.forPOIUserModel(workbook, sheetIndex), firstRow + 1, 1, -1);
workbook.removeSheetAt(sheetIndex); workbook.removeSheetAt(sheetIndex);
Sheet sheet = workbook.createSheet(sheetName); Sheet sheet = workbook.createSheet(sheetName);
@ -198,7 +200,7 @@ public class ExcelWriter {
throw new InvalidLocationException( throw new InvalidLocationException(
range.getSheetName(), "Unknown sheet '" + range.getSheetName() + "'."); range.getSheetName(), "Unknown sheet '" + range.getSheetName() + "'.");
} }
ExcelSheet sheet = new ExcelSheet(workbook, sheetIndex); ExcelSheet sheet = ExcelSheet.forPOIUserModel(workbook, sheetIndex);
if (skipRows != 0) { if (skipRows != 0) {
if (range.isWholeColumn()) { if (range.isWholeColumn()) {

View File

@ -1112,7 +1112,7 @@ add_specs suite_builder =
Problems.expect_warning Duplicate_Output_Column_Names r3 Problems.expect_warning Duplicate_Output_Column_Names r3
Problems.expect_warning Column_Count_Mismatch r3 Problems.expect_warning Column_Count_Mismatch r3
group_builder.specify "during `read_many`, should correctly handle empty sheets" <| group_builder.specify "during `read_many`, should correctly handle empty sheets" <|
with_temp_dir base_dir-> with_temp_dir base_dir->
tsv_file = base_dir / "1.tsv" tsv_file = base_dir / "1.tsv"