Excel Reading (XLSX) using SAX Default Handler (#10877)

- Implement Excel reading as a SAXMLParser.
This commit is contained in:
James Dunkerley 2024-12-13 21:01:16 +00:00 committed by GitHub
parent e6bcd5e485
commit 63ed629210
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 1267 additions and 229 deletions

View File

@ -49,7 +49,7 @@ type Excel_Workbook
- file: The file to load.
- xls_format: Whether to use the old XLS format (default is XLSX).
new : File | Temporary_File -> Boolean -> Excel_Workbook
new file:(File | Temporary_File) xls_format=False =
new file:(File | Temporary_File) xls_format:Boolean=False =
file_for_errors = if file.is_a Temporary_File then Nothing else file
continuation raw_file =
@ -73,7 +73,7 @@ type Excel_Workbook
- xls_format: Whether to use the old XLS format (default is XLSX).
- file: Optional file reference.
from_stream : Input_Stream -> Boolean -> File | Nothing -> Excel_Workbook
from_stream stream xls_format=False file=Nothing = Excel_Reader.handle_bad_format file <|
from_stream stream xls_format:Boolean=False file=Nothing = Excel_Reader.handle_bad_format file <|
temp_file = Temporary_File.from_stream_light stream
Excel_Workbook.new temp_file xls_format
@ -89,8 +89,8 @@ type Excel_Workbook
## PRIVATE
ICON metadata
Returns the list of databases (or catalogs) for the connection.
databases : Nothing
databases self = Nothing
databases : Vector (Text | Nothing)
databases self = [Nothing]
## PRIVATE
ICON metadata
@ -109,7 +109,7 @@ type Excel_Workbook
Arguments:
- database: The target file to open as an Excel_Workbook.
set_database : Text | File -> Excel_Workbook ! Illegal_Argument
set_database self database =
set_database self database:(Text | File) =
if database == self.database then self else
file = File.new database
if file.exists && file.is_directory.not then Excel_Workbook.new file self.xls_format else
@ -163,7 +163,7 @@ type Excel_Workbook
Gets the names of all the named ranges.
named_ranges : Vector Text
named_ranges self = self.with_java_workbook java_workbook->
Vector.from_polyglot_array (ExcelReader.readRangeNames java_workbook)
Vector.from_polyglot_array java_workbook.getRangeNames
## PRIVATE
ICON metadata

View File

@ -20,6 +20,7 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.enso.table.excel.xssfreader.XSSFReaderWorkbook;
public class ExcelConnectionPool {
public static final ExcelConnectionPool INSTANCE = new ExcelConnectionPool();
@ -64,7 +65,7 @@ public class ExcelConnectionPool {
record.refCount = 1;
record.file = file;
record.format = format;
record.workbook = openWorkbook(file, format, false);
record.reopen(true);
records.put(key, record);
return new ReadOnlyExcelConnection(this, key, record);
}
@ -212,10 +213,10 @@ public class ExcelConnectionPool {
private int refCount;
private File file;
private ExcelFileFormat format;
private Workbook workbook;
private ExcelWorkbook workbook;
private IOException initializationException = null;
<T> T withWorkbook(Function<Workbook, T> action) throws IOException {
<T> T withWorkbook(Function<ExcelWorkbook, T> action) throws IOException {
synchronized (this) {
return action.apply(accessCurrentWorkbook());
}
@ -238,7 +239,10 @@ public class ExcelConnectionPool {
}
try {
workbook = openWorkbook(file, format, false);
workbook =
format == ExcelFileFormat.XLSX
? new XSSFReaderWorkbook(file.getAbsolutePath())
: ExcelWorkbook.forPOIUserModel(openWorkbook(file, format, false));
} catch (IOException e) {
initializationException = e;
if (throwOnFailure) {
@ -248,7 +252,7 @@ public class ExcelConnectionPool {
}
}
private Workbook accessCurrentWorkbook() throws IOException {
private ExcelWorkbook accessCurrentWorkbook() throws IOException {
synchronized (this) {
if (workbook == null) {
if (initializationException != null) {
@ -278,7 +282,7 @@ public class ExcelConnectionPool {
throw e;
}
}
case XLSX -> {
case XLSX, XLSX_FALLBACK -> {
try {
PackageAccess access = writeAccess ? PackageAccess.READ_WRITE : PackageAccess.READ;
OPCPackage pkg = OPCPackage.open(file, access);
@ -300,7 +304,7 @@ public class ExcelConnectionPool {
private static Workbook createEmptyWorkbook(ExcelFileFormat format) {
return switch (format) {
case XLS -> new HSSFWorkbook();
case XLSX -> new XSSFWorkbook();
case XLSX, XLSX_FALLBACK -> new XSSFWorkbook();
};
}

View File

@ -2,5 +2,6 @@ package org.enso.table.excel;
public enum ExcelFileFormat {
XLS,
XLSX
XLSX,
XLSX_FALLBACK
}

View File

@ -57,7 +57,7 @@ public class ExcelHeaders {
String[] output = new String[currentEndCol - startCol + 1];
for (int col = startCol; col <= currentEndCol; col++) {
String cellText = row.getFormattedCell(col);
String cellText = row.getCellText(col);
String name = cellText.isEmpty() ? "" : deduplicator.makeUnique(cellText);
output[col - startCol] = name;

View File

@ -197,7 +197,7 @@ public class ExcelRange {
Context context = Context.getCurrent();
while (currentRow != null && !currentRow.isEmpty(excelRange.getLeftColumn(), rightColumn)) {
rightColumn = currentRow.findEndRight(rightColumn);
rightColumn = findEndRight(currentRow, rightColumn);
bottomRow++;
currentRow = sheet.get(bottomRow);
@ -212,6 +212,16 @@ public class ExcelRange {
bottomRow - 1);
}
/**
 * Walks right from {@code start} for as long as the next cell of the row is not empty.
 *
 * @param row the row to scan
 * @param start the column index to start from
 * @return the index of the last column reached before an empty cell; {@code start} itself when
 *     the cell immediately to its right is empty
 */
private static int findEndRight(ExcelRow row, int start) {
  Context polyglotContext = Context.getCurrent();
  int lastFilled = start;
  for (int candidate = start + 1; !row.isEmpty(candidate); candidate++) {
    lastFilled = candidate;
    // Poll the safepoint once per column that has been stepped over.
    polyglotContext.safepoint();
  }
  return lastFilled;
}
/**
* @param index The index to the next character after the parsed value
* @param value Parsed integer value or 0 if not valid

View File

@ -10,24 +10,63 @@ import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.ExcelNumberFormat;
import org.apache.poi.ss.usermodel.FormulaError;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.graalvm.polyglot.Context;
/** Wrapper class to handle Excel rows. */
public class ExcelRow {
private static final DataFormatter formatter = new DataFormatter();
public interface ExcelRow {
/** Gets the initial column index within the row (1-based). */
int getFirstColumn();
private final Row row;
private final int firstColumn;
private final int lastColumn;
private final boolean use1904Format;
/** Gets the final column index within the row (1-based). */
int getLastColumn();
public ExcelRow(Row row, boolean use1904Format) {
this.row = row;
this.firstColumn = row.getFirstCellNum() + 1;
this.lastColumn = row.getLastCellNum();
this.use1904Format = use1904Format;
/** Gets the value of the cell at the given index within the row (1-based). */
Object getCellValue(int column);
/** Gets the text of a cell at the given index within the row (1-based). */
String getCellText(int column);
/** Gets the cell at the given index within the row (1-based). */
Cell get(int column);
/** Checks if the specified cell is empty. */
boolean isEmpty(int column);
/** Checks if the specified set of cells are empty. */
boolean isEmpty(int start, int end);
/** Gets the cells as text. */
String[] getCellsAsText(int startCol, int endCol);
/**
 * Wraps a row of an Apache POI user-model sheet as an {@link ExcelRow}.
 *
 * @param sheet the Apache POI sheet to take the row from
 * @param rowIndex the row index (1-based)
 * @param use1904Format the date-system flag handed on to the created row
 * @return the wrapped row, or {@code null} if the sheet has no row at that index
 */
static ExcelRow forPOIUserModel(Sheet sheet, int rowIndex, boolean use1904Format) {
  // POI rows are 0-based, whereas this interface works with 1-based indexes.
  var row = sheet.getRow(rowIndex - 1);
  if (row == null) {
    return null;
  }
  // The first cell number is shifted by one to make it 1-based; the last cell number is used as
  // reported by POI.
  int firstColumn = row.getFirstCellNum() + 1;
  int lastColumn = row.getLastCellNum();
  return new ExcelRowFromPOIUserModel(row, firstColumn, lastColumn, use1904Format);
}
/**
 * Checks whether every cell of a row between two columns is empty.
 *
 * <p>Only columns inside the row's own first/last column range are inspected.
 *
 * @param row the row to inspect
 * @param start the first column to check
 * @param end the last column to check, or {@code -1} to check up to the row's last column
 * @return {@code true} if no inspected cell holds a value
 */
static boolean isEmptyHelper(ExcelRow row, int start, int end) {
  Context polyglotContext = Context.getCurrent();
  int requestedEnd = end == -1 ? row.getLastColumn() : end;
  int column = Math.max(row.getFirstColumn(), start);
  while (column <= Math.min(row.getLastColumn(), requestedEnd)) {
    if (!row.isEmpty(column)) {
      return false;
    }
    polyglotContext.safepoint();
    column++;
  }
  return true;
}
record ExcelRowFromPOIUserModel(Row row, int firstColumn, int lastColumn, boolean use1904Format)
implements ExcelRow {
private static final DataFormatter formatter = new DataFormatter();
public int getFirstColumn() {
return firstColumn;
}
@ -93,52 +132,8 @@ public class ExcelRow {
}
}
public static CellType getCellType(Cell cell) {
if (cell == null) {
return CellType._NONE;
}
CellType cellType = cell.getCellType();
if (cellType == CellType.FORMULA) {
cellType = cell.getCachedFormulaResultType();
}
return cellType;
}
public boolean isEmpty(int column) {
CellType cellType = getCellType(get(column));
return (cellType == CellType._NONE) || (cellType == CellType.BLANK);
}
public boolean isEmpty(int start, int end) {
Context context = Context.getCurrent();
int currentEnd = end == -1 ? getLastColumn() : end;
for (int column = Math.max(getFirstColumn(), start);
column <= Math.min(getLastColumn(), currentEnd);
column++) {
if (!isEmpty(column)) {
return false;
}
context.safepoint();
}
return true;
}
public int findEndRight(int start) {
Context context = Context.getCurrent();
int column = start;
while (!isEmpty(column + 1)) {
column++;
context.safepoint();
}
return column;
}
/** Returns the formatted cell value. */
public String getFormattedCell(int col) {
var cell = get(col);
public String getCellText(int column) {
var cell = get(column);
if (cell == null) {
return "";
}
@ -161,12 +156,21 @@ public class ExcelRow {
}
default -> {
// Use the default read and then toString.
var value = getCellValue(col);
var value = getCellValue(column);
yield value == null ? "" : value.toString();
}
};
}
public boolean isEmpty(int column) {
CellType cellType = getCellType(get(column));
return (cellType == CellType._NONE) || (cellType == CellType.BLANK);
}
public boolean isEmpty(int start, int end) {
return isEmptyHelper(this, start, end);
}
public String[] getCellsAsText(int startCol, int endCol) {
Context context = Context.getCurrent();
int currentEndCol = endCol == -1 ? getLastColumn() : endCol;
@ -174,7 +178,7 @@ public class ExcelRow {
String[] output = new String[currentEndCol - startCol + 1];
for (int col = startCol; col <= currentEndCol; col++) {
Cell cell = get(col);
CellType type = ExcelRow.getCellType(cell);
CellType type = getCellType(cell);
if (type != CellType._NONE && type != CellType.BLANK && type != CellType.STRING) {
return null;
}
@ -185,4 +189,18 @@ public class ExcelRow {
return output;
}
private static CellType getCellType(Cell cell) {
if (cell == null) {
return CellType._NONE;
}
CellType cellType = cell.getCellType();
if (cellType == CellType.FORMULA) {
cellType = cell.getCachedFormulaResultType();
}
return cellType;
}
}
}

View File

@ -1,37 +1,83 @@
package org.enso.table.excel;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
/** Wrapper class to handle Excel sheets. */
public class ExcelSheet {
private final Sheet sheet;
private final int firstRow;
private final int lastRow;
private final boolean use1904Format;
public interface ExcelSheet {
/** Gets the index of the sheet within the workbook (0-based). */
int getSheetIndex();
public ExcelSheet(Workbook workbook, int sheetIndex) {
this.sheet = workbook.getSheetAt(sheetIndex);
this.firstRow = sheet.getFirstRowNum() + 1;
this.lastRow = sheet.getLastRowNum() + 1;
this.use1904Format = ExcelUtils.is1904DateSystem(workbook);
/** Gets the name of the sheet. */
String getName();
/** Gets the initial row index within the sheet (1-based). */
int getFirstRow();
/** Gets the final row index within the sheet (1-based). */
int getLastRow();
/**
* Gets the row at the given index within the sheet (1-based)
*
* @param row the row index (1-based)
* @return the row object or null if the row index is out of range or doesn't exist.
*/
ExcelRow get(int row);
/** Gets the underlying Apache POI Sheet object - may be null. Provided for Writer use only. */
Sheet getSheet();
/**
 * Wraps the sheet at the given index of an Apache POI workbook as an {@link ExcelSheet}.
 *
 * @param workbook the Apache POI workbook containing the sheet
 * @param sheetIndex the index of the sheet within the workbook (0-based)
 * @return the wrapped sheet
 */
static ExcelSheet forPOIUserModel(Workbook workbook, int sheetIndex) {
  var sheet = workbook.getSheetAt(sheetIndex);
  String sheetName = sheet.getSheetName();
  // POI row numbers are shifted by one because this interface exposes 1-based row indexes.
  int firstRow = sheet.getFirstRowNum() + 1;
  int lastRow = sheet.getLastRowNum() + 1;
  boolean use1904Format = ExcelUtils.is1904DateSystem(workbook);
  return new ExcelSheetFromPOIUserModel(
      sheet, sheetIndex, sheetName, firstRow, lastRow, use1904Format);
}
public int getLastRow() {
return lastRow;
record ExcelSheetFromPOIUserModel(
Sheet sheet,
int sheetIndex,
String sheetName,
int firstRow,
int lastRow,
boolean use1904Format)
implements ExcelSheet {
@Override
public int getSheetIndex() {
return sheetIndex;
}
@Override
public String getName() {
return sheetName;
}
@Override
public int getFirstRow() {
return firstRow;
}
public ExcelRow get(int row) {
Row underlyingRow = row < firstRow || row > lastRow ? null : sheet.getRow(row - 1);
return underlyingRow == null ? null : new ExcelRow(underlyingRow, use1904Format);
@Override
public int getLastRow() {
return lastRow;
}
@Override
public ExcelRow get(int row) {
return row < firstRow || row > lastRow
? null
: ExcelRow.forPOIUserModel(sheet, row, use1904Format);
}
@Override
public Sheet getSheet() {
return sheet;
}
}
}

View File

@ -1,6 +1,10 @@
package org.enso.table.excel;
import java.time.*;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.temporal.ChronoUnit;
import java.time.temporal.Temporal;
import org.apache.poi.ss.usermodel.Workbook;

View File

@ -0,0 +1,123 @@
package org.enso.table.excel;
import java.io.IOException;
import org.apache.poi.ss.usermodel.Name;
/** Represents an Excel workbook. Wraps the underlying Apache POI Workbook object. */
public interface ExcelWorkbook {
  /**
   * Gets the number of spreadsheets in the workbook.
   *
   * @return the number of sheets
   */
  int getNumberOfSheets();

  /**
   * Looks up the index of a sheet by its name.
   *
   * @param name the sheet name
   * @return index of the sheet (0 based)
   */
  int getSheetIndex(String name);

  /**
   * Gets the name of a sheet.
   *
   * @param sheet sheet number (0 based)
   * @return the sheet name
   */
  String getSheetName(int sheet);

  /**
   * Gets the number of defined names.
   *
   * @return the total number of defined names in this workbook
   */
  int getNumberOfNames();

  /**
   * Gets all the range names in the workbook.
   *
   * @return an array of range names
   */
  String[] getRangeNames();

  /**
   * Gets the formula for a named range.
   *
   * @param name the name of the range.
   * @return the formula for the range or null if not found.
   */
  String getNameFormula(String name);

  /**
   * Gets a sheet by its index.
   *
   * @param sheetIndex the index of the sheet (0 based)
   * @return the sheet as an ExcelSheet object
   * @throws IllegalArgumentException if the sheet index is out of range.
   */
  ExcelSheet getSheetAt(int sheetIndex);

  /**
   * Closes the underlying input resource (File or Stream), from which the Workbook was read.
   *
   * <p>Once this has been called, no further operations, updates or reads should be performed on
   * the Workbook.
   *
   * @throws IOException if closing the underlying resource fails
   */
  void close() throws IOException;

  /**
   * Creates an ExcelWorkbook object from an Apache POI Workbook object.
   *
   * @param workbook the Apache POI Workbook object
   * @return the ExcelWorkbook object
   */
  static ExcelWorkbook forPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook) {
    return new ExcelWorkbookFromPOIUserModel(workbook);
  }

  /** Wraps an Apache POI Workbook object in the {@link ExcelWorkbook} interface. */
  record ExcelWorkbookFromPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook)
      implements ExcelWorkbook {
    @Override
    public int getNumberOfSheets() {
      return workbook.getNumberOfSheets();
    }

    @Override
    public int getSheetIndex(String name) {
      return workbook.getSheetIndex(name);
    }

    @Override
    public String getSheetName(int sheet) {
      return workbook.getSheetName(sheet);
    }

    @Override
    public int getNumberOfNames() {
      return workbook.getNumberOfNames();
    }

    @Override
    public String[] getRangeNames() {
      var definedNames = workbook.getAllNames();
      var rangeNames = new String[definedNames.size()];
      int next = 0;
      for (Name definedName : definedNames) {
        rangeNames[next++] = definedName.getNameName();
      }
      return rangeNames;
    }

    @Override
    public String getNameFormula(String name) {
      var definedName = workbook.getName(name);
      if (definedName == null) {
        return null;
      }
      return definedName.getRefersToFormula();
    }

    @Override
    public ExcelSheet getSheetAt(int sheetIndex) {
      return ExcelSheet.forPOIUserModel(workbook, sheetIndex);
    }

    @Override
    public void close() throws IOException {
      workbook.close();
    }
  }
}

View File

@ -2,7 +2,6 @@ package org.enso.table.excel;
import java.io.IOException;
import java.util.function.Function;
import org.apache.poi.ss.usermodel.Workbook;
public class ReadOnlyExcelConnection implements AutoCloseable {
@ -28,7 +27,7 @@ public class ReadOnlyExcelConnection implements AutoCloseable {
record = null;
}
public synchronized <T> T withWorkbook(Function<Workbook, T> f) throws IOException {
public synchronized <T> T withWorkbook(Function<ExcelWorkbook, T> f) throws IOException {
if (record == null) {
throw new IllegalStateException("ReadOnlyExcelConnection is being used after it was closed.");
}

View File

@ -0,0 +1,29 @@
package org.enso.table.excel.xssfreader;
import java.util.HashMap;
import java.util.Map;
import org.apache.poi.xssf.model.StylesTable;
/**
 * Provides the format strings for number formats in an XSSF workbook, remembering each answer so
 * a style index is only resolved against the styles table once.
 */
public class XSSFReaderFormats {
  private final StylesTable stylesTable;
  private final Map<Short, String> numberFormats = new HashMap<>();

  /**
   * @param stylesTable the styles table the formats are looked up in
   */
  public XSSFReaderFormats(StylesTable stylesTable) {
    this.stylesTable = stylesTable;
  }

  /**
   * Gets the number format string of the style at the given index.
   *
   * @param styleIdx the index of the cell style
   * @return the format string, or an empty string when the style is missing, has no format
   *     string, or uses the "General" format
   */
  public String getNumberFormatAt(short styleIdx) {
    // Stored values are never null, so a null lookup means the index has not been resolved yet.
    String known = numberFormats.get(styleIdx);
    if (known != null) {
      return known;
    }

    var style = stylesTable.getStyleAt(styleIdx);
    String resolved = style == null ? null : style.getDataFormatString();
    if (resolved == null || "General".equals(resolved)) {
      resolved = "";
    }
    numberFormats.put(styleIdx, resolved);
    return resolved;
  }
}

View File

@ -0,0 +1,125 @@
package org.enso.table.excel.xssfreader;
import java.time.LocalDateTime;
import java.util.SortedMap;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.enso.table.excel.ExcelRow;
/** A row of cell values collected by the SAX-based sheet handler, keyed by column index. */
public class XSSFReaderRow implements ExcelRow {
  private static final DataFormatter formatter = new DataFormatter();
  private final SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue> data;
  private final boolean use1904Dates;

  /**
   * @param data the cell values of the row keyed by column index
   * @param use1904Dates flag passed on when converting and formatting date values
   */
  public XSSFReaderRow(
      SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue> data, boolean use1904Dates) {
    this.data = data;
    this.use1904Dates = use1904Dates;
  }

  @Override
  public int getFirstColumn() {
    return data.firstKey();
  }

  @Override
  public int getLastColumn() {
    return data.lastKey();
  }

  @Override
  public Cell get(int column) {
    // Not supported as we don't have the underlying Apache POI Cell object.
    throw new UnsupportedOperationException("XSSFReader does not support getting the Cell object.");
  }

  @Override
  public Object getCellValue(int column) {
    var cell = cellAt(column);
    if (cell == null) {
      return null;
    }

    return switch (cell.dataType()) {
      case BLANK, ERROR -> null;
      case BOOL -> cell.getBooleanValue();
      case DATE -> LocalDateTime.parse(cell.strValue()); // Don't believe used by Excel.
      case INLINE_STRING, SST_STRING, FORMULA_STRING -> cell.strValue();
      case INTEGER -> cell.getIntegerValue();
      case NUMBER -> wholeAsLong(cell.getNumberValue());
      case OLE_DATE -> cell.getDateValue(use1904Dates);
      case OLE_DATETIME -> cell.getDateTimeValue(use1904Dates);
    };
  }

  @Override
  public String getCellText(int column) {
    var cell = cellAt(column);
    if (cell == null) {
      return "";
    }

    switch (cell.dataType()) {
      case BLANK:
        return "";
      case BOOL:
        return cell.getBooleanValue() ? "TRUE" : "FALSE";
      case NUMBER:
      case INTEGER:
      case OLE_DATE:
      case OLE_DATETIME:
        // Special handling for Number or Date cells as want to keep formatting.
        return formatNumeric(cell);
      default:
        return cell.strValue();
    }
  }

  @Override
  public boolean isEmpty(int column) {
    var cell = cellAt(column);
    if (cell == null) {
      return true;
    }
    return cell.strValue().isEmpty();
  }

  @Override
  public boolean isEmpty(int start, int end) {
    int requestedEnd = end == -1 ? getLastColumn() : end;
    int column = Math.max(getFirstColumn(), start);
    while (column <= Math.min(getLastColumn(), requestedEnd)) {
      if (!isEmpty(column)) {
        return false;
      }
      column++;
    }
    return true;
  }

  @Override
  public String[] getCellsAsText(int startCol, int endCol) {
    int finalCol = endCol == -1 ? getLastColumn() : endCol;
    String[] texts = new String[finalCol - startCol + 1];

    int slot = 0;
    for (int col = startCol; col <= finalCol; col++, slot++) {
      var cell = cellAt(col);
      if (cell == null) {
        texts[slot] = "";
      } else if (cell.dataType().isString()) {
        texts[slot] = cell.strValue();
      } else {
        // Short circuit if find not a string cell.
        return null;
      }
    }

    return texts;
  }

  /** Looks up the stored value for a column, or null when the row has no entry for it. */
  private XSSFReaderSheetXMLHandler.CellValue cellAt(int column) {
    return data.get((short) column);
  }

  /** Returns the number as a Long when it equals its truncated value, otherwise as a Double. */
  private static Object wholeAsLong(double number) {
    long truncated = (long) number;
    if (number == truncated) {
      return truncated;
    }
    return number;
  }

  /** Formats a numeric cell with its format string, or returns its raw text if it has none. */
  private String formatNumeric(XSSFReaderSheetXMLHandler.CellValue cell) {
    var pattern = cell.format();
    boolean hasPattern = pattern != null && !pattern.isEmpty();
    if (!hasPattern) {
      return cell.strValue();
    }
    return formatter.formatRawCellContents(cell.getNumberValue(), -1, pattern, use1904Dates);
  }
}

View File

@ -0,0 +1,150 @@
package org.enso.table.excel.xssfreader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.util.XMLHelper;
import org.enso.table.excel.ExcelRow;
import org.enso.table.excel.ExcelSheet;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * A sheet of a workbook read through the SAX-based XSSF reader.
 *
 * <p>The sheet XML is not parsed until the first request for its dimensions, row range or rows.
 */
public class XSSFReaderSheet implements ExcelSheet {
  private final int sheetIdx;
  private final String sheetName;
  private final String relId;
  private final XSSFReaderWorkbook parent;

  private boolean hasReadSheetData = false;
  private String dimensions;
  private int firstRow;
  private int lastRow;
  private Map<Integer, SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue>> rowData;

  /**
   * @param sheetIdx the index of the sheet within the workbook
   * @param sheetName the name of the sheet
   * @param relId the relationship id used to open the sheet through the reader
   * @param parent the workbook supplying shared strings, styles and the reader
   */
  public XSSFReaderSheet(int sheetIdx, String sheetName, String relId, XSSFReaderWorkbook parent) {
    this.sheetIdx = sheetIdx;
    this.sheetName = sheetName;
    this.relId = relId;
    this.parent = parent;
  }

  /**
   * Parses the sheet XML the first time it is called; later calls return immediately.
   *
   * @throws RuntimeException wrapping any SAX, parser-configuration, format or I/O failure
   */
  private synchronized void ensureReadSheetData() {
    if (hasReadSheetData) {
      return;
    }

    try {
      var strings = parent.getSharedStrings();
      var styles = parent.getStyles();
      var handler =
          new XSSFReaderSheetXMLHandler(styles, strings) {
            @Override
            protected void onDimensions(String dimension) {
              handleOnDimensions(dimension);
            }

            @Override
            protected void onStartRow(int rowNum) {
              handleOnStartRow(rowNum);
            }

            @Override
            protected void onCell(int rowNumber, short columnNumber, String ref, CellValue value) {
              handleOnCell(rowNumber, columnNumber, value);
            }
          };

      var xmlReader = XMLHelper.newXMLReader();
      xmlReader.setContentHandler(handler);

      rowData = new HashMap<>();

      try {
        parent.withReader(
            reader -> {
              // The sheet stream is opened here, so it is closed here as well - including when
              // parsing fails part way through.
              try (var sheet = reader.getSheet(relId)) {
                xmlReader.parse(new InputSource(sheet));
              } catch (SAXException | InvalidFormatException | IOException e) {
                throw new RuntimeException(e);
              }
            });
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

      // Only flagged once parsing completed without an exception.
      hasReadSheetData = true;
    } catch (SAXException | ParserConfigurationException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public int getSheetIndex() {
    return sheetIdx;
  }

  @Override
  public String getName() {
    return sheetName;
  }

  /**
   * Gets the value of the sheet's dimension reference.
   *
   * @return the dimension reference reported while parsing, or null if none was reported
   */
  public String getDimensions() {
    ensureReadSheetData();
    return dimensions;
  }

  @Override
  public int getFirstRow() {
    ensureReadSheetData();
    return firstRow;
  }

  @Override
  public int getLastRow() {
    ensureReadSheetData();
    return lastRow;
  }

  @Override
  public ExcelRow get(int row) {
    ensureReadSheetData();

    // Rows are only stored once a cell has been reported for them, so a missing entry means the
    // sheet holds no cells in that row.
    var cells = rowData.get(row);
    if (cells == null) {
      return null;
    }

    return new XSSFReaderRow(cells, parent.use1904Format());
  }

  @Override
  public Sheet getSheet() {
    // Not supported as we don't have the underlying Apache POI Sheet object.
    throw new UnsupportedOperationException(
        "XSSFReader does not support getting the Sheet object.");
  }

  /** Records the dimension reference reported by the handler. */
  protected void handleOnDimensions(String dimension) {
    dimensions = dimension;
  }

  /** Widens the first/last row range to include the reported row (0 means "not set yet"). */
  private void handleOnStartRow(int rowNum) {
    if (firstRow == 0 || rowNum < firstRow) {
      firstRow = rowNum;
    }

    if (lastRow == 0 || rowNum > lastRow) {
      lastRow = rowNum;
    }
  }

  /** Stores a reported cell value under its row and column. */
  private void handleOnCell(
      int rowNumber, short columnNumber, XSSFReaderSheetXMLHandler.CellValue value) {
    rowData.computeIfAbsent(rowNumber, k -> new TreeMap<>()).put(columnNumber, value);
  }
}

View File

@ -0,0 +1,259 @@
package org.enso.table.excel.xssfreader;
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
import java.time.ZonedDateTime;
import java.time.temporal.Temporal;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.enso.table.excel.ExcelUtils;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX-based Handler to Read Excel XML on top of POI support. Based on the XSSFSheetXMLHandler class
 * from Apache POI. Technical specification can be found at:
 * https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oe376/db9b9b72-b10b-4e7e-844c-09f88c972219
 * https://ecma-international.org/publications-and-standards/standards/ecma-376/
 *
 * <p>Subclasses receive the parsed content through the {@code on...} callbacks.
 */
public class XSSFReaderSheetXMLHandler extends DefaultHandler {
  private final XSSFReaderFormats styles;
  private final SharedStrings sharedStrings;

  /** The kind of value held by a cell, as worked out while parsing. */
  public enum XSSDataType {
    BLANK,
    BOOL,
    DATE,
    ERROR,
    INLINE_STRING,
    SST_STRING,
    NUMBER,
    INTEGER,
    OLE_DATE,
    OLE_DATETIME,
    FORMULA_STRING;

    /** Returns true for the three string kinds (inline, shared and formula strings). */
    public boolean isString() {
      return this == INLINE_STRING || this == SST_STRING || this == FORMULA_STRING;
    }
  }

  // Record if seen a value element
  private boolean seenValue;

  // Set when V start element is seen
  private boolean vIsOpen;

  // Set when an Inline String "is" is seen
  private boolean isIsOpen;

  // The current row being read (or -1 if not in a row)
  private int rowNumber = -1;

  // Handle missing rowNumber in the XML (happens in Excel), first row would be row 1.
  private int nextRowNumber = 1;

  // Column used for a cell that has no "r" attribute: one past the previous cell of the row.
  private int nextColumnNumber = 1;

  // The current cell being read (or null if not in a cell, or the cell has no "r" attribute)
  private String cellRef;

  // Set when cell start element is seen, used when cell close element is seen.
  private XSSDataType dataType;

  // Gathers characters as they are seen.
  private final StringBuilder value = new StringBuilder(64);
  private String numberFormat = null;

  /**
   * @param styles source of the number format strings for styled numeric cells
   * @param strings the shared strings table used to resolve shared-string cells
   */
  public XSSFReaderSheetXMLHandler(XSSFReaderFormats styles, SharedStrings strings) {
    this.styles = styles;
    this.sharedStrings = strings;
  }

  /** Checks whether the element's character content is part of a cell value. */
  private boolean isTextTag(String name) {
    return "v".equals(name) || "inlineStr".equals(name) || ("t".equals(name) && isIsOpen);
  }

  @Override
  public void startElement(String uri, String localName, String qName, Attributes attributes) {
    if (uri != null && !NS_SPREADSHEETML.equals(uri)) {
      return;
    }

    if (isTextTag(localName)) {
      seenValue = true;
      vIsOpen = true;
      // Inside an inline string the text elements are accumulated rather than restarted.
      if (!isIsOpen) {
        value.setLength(0);
      }
      return;
    }

    switch (localName) {
      case "dimension" -> { // Dimensions of sheet
        var dimension = attributes.getValue("ref");
        if (dimension != null) {
          onDimensions(dimension);
        }
      }
      case "row" -> { // Row
        String rowNumStr = attributes.getValue("r");
        rowNumber = rowNumStr == null ? nextRowNumber : Integer.parseInt(rowNumStr);
        nextColumnNumber = 1;
        onStartRow(rowNumber);
      }
      case "c" -> startCell(attributes); // Cell
      case "is" -> isIsOpen = true; // Inline String
    }
  }

  /** Resets the per-cell state and reads the cell's reference, type and number format. */
  private void startCell(Attributes attributes) {
    cellRef = attributes.getValue("r");
    seenValue = false;

    // Start every cell with an empty buffer. Text elements of an inline string never clear it,
    // so without this the text of earlier cells would be prepended to an inline string.
    value.setLength(0);

    String cellType = attributes.getValue("t");
    if (cellType == null) {
      cellType = "n"; // Number is default
    }

    dataType =
        switch (cellType) {
          case "b" -> XSSDataType.BOOL;
          case "e" -> XSSDataType.ERROR;
          case "d" -> XSSDataType.DATE; // Date in ISO 8601 format.
          case "inlineStr" -> XSSDataType.INLINE_STRING;
          case "s" -> XSSDataType.SST_STRING;
          case "str" -> XSSDataType.FORMULA_STRING; // String formula
          default -> XSSDataType.NUMBER;
        };

    // Read the format for NUMBER
    numberFormat = null;
    if (dataType == XSSDataType.NUMBER) {
      String cellStyleStr = attributes.getValue("s");
      if (cellStyleStr != null) {
        short styleIndex = (short) Integer.parseInt(cellStyleStr);
        numberFormat = styles.getNumberFormatAt(styleIndex);
      }
    }
  }

  /** Captures characters if a suitable element is open. */
  @Override
  public void characters(char[] ch, int start, int length) {
    if (vIsOpen) {
      value.append(ch, start, length);
    }
  }

  @Override
  public void endElement(String uri, String localName, String qName) {
    if (uri != null && !NS_SPREADSHEETML.equals(uri)) {
      return;
    }

    if (isTextTag(localName)) {
      vIsOpen = false;
      return;
    }

    switch (localName) {
      case "sheetData" -> onSheetEnd();
      case "row" -> {
        nextRowNumber = rowNumber + 1;
        rowNumber = -1;
      }
      case "c" -> outputCellValue();
      case "is" -> isIsOpen = false;
    }
  }

  /**
   * The parsed content of a single cell.
   *
   * @param dataType the kind of value held
   * @param strValue the text of the value
   * @param format the number format string, or null when none was read for the cell
   */
  public record CellValue(XSSDataType dataType, String strValue, String format) {
    /** Returns true when the text starts with the character '1'. */
    public boolean getBooleanValue() {
      return strValue.charAt(0) == '1';
    }

    /** Parses the text as a double. */
    public double getNumberValue() {
      return Double.parseDouble(strValue);
    }

    /** Parses the text as a long. */
    public long getIntegerValue() {
      return Long.parseLong(strValue);
    }

    /** Converts the integer value to a date using the requested date system. */
    public Temporal getDateValue(boolean use1904Dates) {
      return use1904Dates
          ? ExcelUtils.fromExcelDateTime1904(getIntegerValue())
          : ExcelUtils.fromExcelDateTime(getIntegerValue());
    }

    /**
     * Converts the numeric value to a date-time using the requested date system.
     *
     * <p>In 1904 mode, a result on the first day of 1904 whose format has no year, month or day
     * letters is returned as a time of day only.
     */
    public Temporal getDateTimeValue(boolean use1904Dates) {
      if (use1904Dates) {
        var datetime = ExcelUtils.fromExcelDateTime1904(getNumberValue());
        if (datetime instanceof ZonedDateTime zdt
            && zdt.getYear() == 1904
            && zdt.getDayOfYear() == 1
            && !format.contains("y")
            && !format.contains("M")
            && !format.contains("d")) {
          datetime = zdt.toLocalTime();
        }
        return datetime;
      }

      return ExcelUtils.fromExcelDateTime(getNumberValue());
    }
  }

  /**
   * Gets the text of the current cell: the shared string for a shared-string cell, the rich text
   * content for an inline string, and the gathered characters otherwise.
   */
  public String getStringValue() {
    if (dataType == XSSDataType.SST_STRING) {
      return getSharedString(value.toString());
    } else if (dataType == XSSDataType.INLINE_STRING) {
      return new XSSFRichTextString(value.toString()).toString();
    }
    return value.toString();
  }

  /** Resolves a shared string from the text form of its index (null if the item is null). */
  private String getSharedString(String value) {
    int idx = Integer.parseInt(value);
    var ss = sharedStrings.getItemAt(idx);
    return ss == null ? null : ss.toString();
  }

  /** Reads the column number from the leading upper-case letters of a cell reference. */
  private static short parseColumnNumber(String ref) {
    short columnNumber = 0;
    for (int i = 0; i < ref.length(); i++) {
      char c = ref.charAt(i);
      if (c < 'A' || c > 'Z') {
        break;
      }
      columnNumber = (short) (columnNumber * 26 + (c - 'A' + 1));
    }
    return columnNumber;
  }

  /** Checks that numeric text has neither a decimal point nor an exponent. */
  private static boolean isIntegerText(String text) {
    // Text such as "1E-3" has no decimal point but cannot be parsed as a long.
    return text.indexOf('.') < 0 && text.indexOf('E') < 0 && text.indexOf('e') < 0;
  }

  /** Builds the value of the cell that has just ended and reports it through onCell. */
  private void outputCellValue() {
    // A cell without a reference is placed directly after the previous cell of its row.
    short columnNumber = cellRef == null ? (short) nextColumnNumber : parseColumnNumber(cellRef);
    nextColumnNumber = columnNumber + 1;

    if (!seenValue) {
      onCell(rowNumber, columnNumber, cellRef, new CellValue(XSSDataType.BLANK, "", null));
      return;
    }

    var stringValue = getStringValue();
    if (dataType == XSSDataType.NUMBER) {
      boolean isInteger = isIntegerText(stringValue);
      boolean isDate = DateUtil.isADateFormat(-1, numberFormat);
      if (isInteger && isDate) {
        dataType = XSSDataType.OLE_DATE;
      } else if (isInteger) {
        dataType = XSSDataType.INTEGER;
      } else if (isDate) {
        dataType = XSSDataType.OLE_DATETIME;
      }
    }

    var cellValue = new CellValue(dataType, stringValue, numberFormat);
    onCell(rowNumber, columnNumber, cellRef, cellValue);
  }

  /** Called with the value of the "ref" attribute of a dimension element. */
  protected void onDimensions(String dimension) {}

  /** Called when a row element starts, with its row number. */
  protected void onStartRow(int rowNumber) {}

  /**
   * Called when a cell element ends.
   *
   * @param rowNumber the number of the row being read
   * @param columnNumber the column number of the cell
   * @param ref the cell reference from the XML, or null if the cell did not carry one
   * @param cellValue the parsed content of the cell
   */
  protected void onCell(int rowNumber, short columnNumber, String ref, CellValue cellValue) {}

  /** Called when the sheetData element ends. */
  protected void onSheetEnd() {}
}

View File

@ -0,0 +1,284 @@
package org.enso.table.excel.xssfreader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.poi.ooxml.util.DocumentHelper;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.RichTextString;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStrings;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.enso.table.excel.ExcelSheet;
import org.enso.table.excel.ExcelWorkbook;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Read-only {@link ExcelWorkbook} for XLSX files built on POI's event-based {@link XSSFReader}.
 *
 * <p>The workbook part (1904 date flag, sheet list and defined names) is parsed eagerly by the
 * constructor. The shared strings and styles tables are loaded lazily on first request. The
 * underlying package is opened and closed again by every {@link #withReader(Consumer)} call, so
 * nothing is kept open between calls and {@link #close()} has nothing to release.
 */
public class XSSFReaderWorkbook implements ExcelWorkbook {
  private static final XPathFactory xpathFactory = XPathFactory.newInstance();
  private static final NamespaceContext namespaceContext = new SpreadsheetContext();

  /** Compiled expressions by XPath text. Guarded by its own monitor. */
  private static final Map<String, XPathExpression> xpathCache = new HashMap<>();

  /**
   * Returns the compiled form of {@code xpath}, with the {@code ss} prefix bound to the
   * SpreadsheetML namespace, compiling and caching it on first use.
   *
   * <p>The cache and the factory are static and shared by every workbook instance, and neither
   * {@link HashMap} nor {@link XPathFactory} is thread-safe, so all access happens under the
   * cache's monitor.
   */
  private static XPathExpression compileXPathWithNamespace(String xpath)
      throws XPathExpressionException {
    synchronized (xpathCache) {
      var compiled = xpathCache.get(xpath);
      if (compiled == null) {
        var newXPath = xpathFactory.newXPath();
        newXPath.setNamespaceContext(namespaceContext);
        compiled = newXPath.compile(xpath);
        xpathCache.put(xpath, compiled);
      }
      return compiled;
    }
  }

  /**
   * Evaluates {@code xpath} against {@code doc} and returns the matching node set.
   *
   * <p>Cached {@link XPathExpression} objects are shared between instances and are not
   * thread-safe, so evaluation is serialised on the same monitor as the cache.
   */
  private static NodeList selectNodes(Document doc, String xpath) throws XPathExpressionException {
    synchronized (xpathCache) {
      return (NodeList) compileXPathWithNamespace(xpath).evaluate(doc, XPathConstants.NODESET);
    }
  }

  /**
   * Evaluates {@code xpath} against {@code doc} and returns the first matching node, or {@code
   * null} when nothing matches. Serialised for the same reason as {@link #selectNodes}.
   */
  private static Node selectNode(Document doc, String xpath) throws XPathExpressionException {
    synchronized (xpathCache) {
      return (Node) compileXPathWithNamespace(xpath).evaluate(doc, XPathConstants.NODE);
    }
  }

  /** Interprets an {@code xsd:boolean} attribute value; both {@code 1} and {@code true} are true. */
  private static boolean isXmlTrue(String value) {
    if (value == null) {
      return false;
    }
    var trimmed = value.trim();
    return "1".equals(trimmed) || "true".equals(trimmed);
  }

  /** Namespace context mapping the single prefix {@code ss} to the SpreadsheetML namespace. */
  private static class SpreadsheetContext implements NamespaceContext {
    @Override
    public String getNamespaceURI(String prefix) {
      if (prefix == null) {
        throw new IllegalArgumentException("prefix cannot be null");
      }
      return prefix.equals("ss") ? XSSFRelation.NS_SPREADSHEETML : XMLConstants.NULL_NS_URI;
    }

    @Override
    public String getPrefix(String namespaceURI) {
      if (namespaceURI == null) {
        throw new IllegalArgumentException("namespaceURI cannot be null");
      }
      return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML) ? "ss" : null;
    }

    @Override
    public Iterator<String> getPrefixes(String namespaceURI) {
      if (namespaceURI == null) {
        throw new IllegalArgumentException("namespaceURI cannot be null");
      }
      return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML)
          ? Collections.singleton("ss").iterator()
          : Arrays.stream(new String[0]).iterator();
    }
  }

  /** Stand-in used when the package has no shared strings part. */
  private static final class EmptySharedStrings implements SharedStrings {
    @Override
    public RichTextString getItemAt(int idx) {
      return null;
    }

    @Override
    public int getCount() {
      return 0;
    }

    @Override
    public int getUniqueCount() {
      return 0;
    }
  }

  public static final String WORKBOOK_CONFIG_XPATH = "/ss:workbook/ss:workbookPr";
  public static final String SHEET_NAME_XPATH = "/ss:workbook/ss:sheets/ss:sheet";
  public static final String NAMED_RANGE_XPATH = "/ss:workbook/ss:definedNames/ss:definedName";

  private final String path;
  private boolean use1904DateSystemFlag = false;
  private List<SheetInfo> sheetInfos;
  private Map<String, SheetInfo> sheetInfoMap;
  private Map<String, NamedRange> namedRangeMap;
  private boolean hasReadShared = false;
  private SharedStrings sharedStrings;
  private XSSFReaderFormats styles;

  /**
   * Opens the file at {@code path} and reads the workbook-level metadata.
   *
   * @param path location of the XLSX file.
   * @throws IOException if the package cannot be opened or is not a valid XLSX package.
   * @throws RuntimeException wrapping any failure raised while parsing the workbook part.
   */
  public XSSFReaderWorkbook(String path) throws IOException {
    this.path = path;

    // Read the workbook data
    this.readWorkbookData();
  }

  /** Returns the path this workbook was opened from. */
  public String getPath() {
    return path;
  }

  /**
   * Opens the package read-only, hands an {@link XSSFReader} over it to {@code action} and closes
   * the package again afterwards.
   *
   * @param action callback given the reader; it must not keep the reader after returning.
   * @throws IOException if the package cannot be opened, including when POI reports an
   *     {@link OpenXML4JException}.
   */
  void withReader(Consumer<XSSFReader> action) throws IOException {
    try (var pkg = OPCPackage.open(path, PackageAccess.READ)) {
      var reader = new XSSFReader(pkg);
      action.accept(reader);
    } catch (OpenXML4JException e) {
      throw new IOException(
          "Invalid format encountered when opening the file " + path + " as XLSX.", e);
    }
  }

  private record SheetInfo(int index, int sheetId, String name, String relID, boolean visible) {}

  private record NamedRange(String name, String formula) {}

  /** Parses the workbook part and fills in the date system flag, sheet list and named ranges. */
  private void readWorkbookData() throws IOException {
    withReader(
        reader -> {
          // try-with-resources so the part stream is released even if parsing fails.
          try (var workbookData = reader.getWorkbookData()) {
            var workbookDoc = DocumentHelper.readDocument(workbookData);
            read1904DateSetting(workbookDoc);
            readSheetInfo(workbookDoc);
            readNamedRanges(workbookDoc);
          } catch (SAXException
              | IOException
              | InvalidFormatException
              | XPathExpressionException e) {
            // Consumer cannot throw checked exceptions, so they are rethrown unchecked.
            throw new RuntimeException(e);
          }
        });
  }

  /** Collects every {@code definedName} element into {@link #namedRangeMap}, keyed by name. */
  private void readNamedRanges(Document workbookDoc) throws XPathExpressionException {
    var nameNodes = selectNodes(workbookDoc, NAMED_RANGE_XPATH);
    namedRangeMap = new HashMap<>();
    for (int i = 0; i < nameNodes.getLength(); i++) {
      var node = nameNodes.item(i);
      var name = node.getAttributes().getNamedItem("name").getNodeValue();
      var formula = node.getTextContent();
      namedRangeMap.put(name, new NamedRange(name, formula));
    }
  }

  /** Collects every {@code sheet} element, in document order, into the sheet list and map. */
  private void readSheetInfo(Document workbookDoc) throws XPathExpressionException {
    var sheetNodes = selectNodes(workbookDoc, SHEET_NAME_XPATH);
    sheetInfos = new ArrayList<>(sheetNodes.getLength());
    sheetInfoMap = new HashMap<>();
    for (int i = 0; i < sheetNodes.getLength(); i++) {
      var attributes = sheetNodes.item(i).getAttributes();
      var sheetName = attributes.getNamedItem("name").getNodeValue();
      var sheetId = Integer.parseInt(attributes.getNamedItem("sheetId").getNodeValue());
      var relId = attributes.getNamedItem("r:id").getNodeValue();

      // The state attribute defaults to "visible" but may also be written out explicitly;
      // only "hidden" and "veryHidden" mean the sheet is not visible.
      var state = attributes.getNamedItem("state");
      var visible = state == null || "visible".equals(state.getNodeValue());

      var sheetInfo = new SheetInfo(i, sheetId, sheetName, relId, visible);
      sheetInfos.add(sheetInfo);
      sheetInfoMap.put(sheetName, sheetInfo);
    }
  }

  /** Reads the {@code date1904} attribute of {@code workbookPr}; the flag stays false if absent. */
  private void read1904DateSetting(Document workbookDoc) throws XPathExpressionException {
    var workbookNode = selectNode(workbookDoc, WORKBOOK_CONFIG_XPATH);
    if (workbookNode != null) {
      var date1904 = workbookNode.getAttributes().getNamedItem("date1904");
      // date1904 is an xsd:boolean, so "true" is as valid as "1".
      use1904DateSystemFlag = date1904 != null && isXmlTrue(date1904.getNodeValue());
    }
  }

  /**
   * Loads the shared strings and styles tables on first use. A failed load leaves the flag unset,
   * so the next call tries again.
   *
   * @throws RuntimeException wrapping any failure raised while reading either table.
   */
  private synchronized void ensureReadShared() {
    if (hasReadShared) {
      return;
    }

    try {
      withReader(
          reader -> {
            try {
              reader.setUseReadOnlySharedStringsTable(true);
              var loadedStrings = reader.getSharedStringsTable();
              sharedStrings = loadedStrings == null ? new EmptySharedStrings() : loadedStrings;

              // Read the styles table and attach the format data
              var stylesTable = reader.getStylesTable();
              styles = new XSSFReaderFormats(stylesTable);

              hasReadShared = true;
            } catch (InvalidFormatException | IOException e) {
              throw new RuntimeException(e);
            }
          });
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Flag that workbook is in 1904 format. */
  boolean use1904Format() {
    return use1904DateSystemFlag;
  }

  @Override
  public int getNumberOfSheets() {
    // Use the list so the count always agrees with the bounds checks on sheet indexes.
    return sheetInfos.size();
  }

  /** Returns the 0-based position of the named sheet, or -1 if there is no such sheet. */
  @Override
  public int getSheetIndex(String name) {
    var sheetInfo = sheetInfoMap.get(name);
    return sheetInfo == null ? -1 : sheetInfo.index;
  }

  /**
   * Returns the name of the sheet at the given 0-based index.
   *
   * @throws IllegalArgumentException if the index is out of range.
   */
  @Override
  public String getSheetName(int sheet) {
    if (sheet < 0 || sheet >= sheetInfos.size()) {
      throw new IllegalArgumentException("Sheet index out of range: " + sheet);
    }
    return sheetInfos.get(sheet).name;
  }

  @Override
  public int getNumberOfNames() {
    return namedRangeMap.size();
  }

  @Override
  public String[] getRangeNames() {
    return namedRangeMap.keySet().toArray(String[]::new);
  }

  /** Returns the formula text of the named range, or {@code null} if the name is unknown. */
  @Override
  public String getNameFormula(String name) {
    var namedRange = namedRangeMap.get(name);
    return namedRange == null ? null : namedRange.formula;
  }

  /** Returns the shared strings table, loading it on first use; never {@code null}. */
  public SharedStrings getSharedStrings() {
    ensureReadShared();
    return sharedStrings;
  }

  /** Returns the format data derived from the styles table, loading it on first use. */
  public XSSFReaderFormats getStyles() {
    ensureReadShared();
    return styles;
  }

  /**
   * Creates a sheet reader for the sheet at the given 0-based index.
   *
   * @throws IllegalArgumentException if the index is out of range.
   */
  @Override
  public ExcelSheet getSheetAt(int sheetIndex) {
    if (sheetIndex < 0 || sheetIndex >= sheetInfos.size()) {
      throw new IllegalArgumentException("Sheet index out of range: " + sheetIndex);
    }
    var sheetInfo = sheetInfos.get(sheetIndex);
    return new XSSFReaderSheet(sheetIndex, sheetInfo.name, sheetInfo.relID, this);
  }

  @Override
  public void close() throws IOException {
    // Nothing to do
  }
}

View File

@ -7,9 +7,6 @@ import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Name;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.CellReference;
import org.enso.table.data.column.builder.Builder;
import org.enso.table.data.column.builder.InferredBuilder;
@ -24,6 +21,7 @@ import org.enso.table.excel.ExcelHeaders;
import org.enso.table.excel.ExcelRange;
import org.enso.table.excel.ExcelRow;
import org.enso.table.excel.ExcelSheet;
import org.enso.table.excel.ExcelWorkbook;
import org.enso.table.excel.ReadOnlyExcelConnection;
import org.enso.table.problems.ProblemAggregator;
import org.graalvm.polyglot.Context;
@ -38,18 +36,17 @@ public class ExcelReader {
* @return a String[] containing the sheet names.
* @throws IOException when the action fails
*/
public static String[] readSheetNames(File file, ExcelFileFormat format)
throws IOException, InvalidFormatException {
public static String[] readSheetNames(File file, ExcelFileFormat format) throws IOException {
return withWorkbook(file, format, ExcelReader::readSheetNames);
}
/**
* Reads a list of sheet names from a workbook into an array.
*
* @param workbook a {@link Workbook} to read the sheet names from.
* @param workbook an {@link ExcelWorkbook} to read the sheet names from.
* @return a String[] containing the sheet names.
*/
public static String[] readSheetNames(Workbook workbook) {
public static String[] readSheetNames(ExcelWorkbook workbook) {
int sheetCount = workbook.getNumberOfSheets();
var output = new String[sheetCount];
Context context = Context.getCurrent();
@ -68,20 +65,8 @@ public class ExcelReader {
* @return a String[] containing the range names.
* @throws IOException when the action fails
*/
public static String[] readRangeNames(File file, ExcelFileFormat format)
throws IOException, InvalidFormatException {
return withWorkbook(file, format, ExcelReader::readRangeNames);
}
/**
* Reads a list of range names for the specified XLSX/XLS file into an array.
*
* @param workbook a {@link Workbook} to read the sheet names from.
* @return a String[] containing the range names.
*/
public static String[] readRangeNames(Workbook workbook) {
var names = workbook.getAllNames();
return names.stream().map(Name::getNameName).toArray(String[]::new);
public static String[] readRangeNames(File file, ExcelFileFormat format) throws IOException {
return withWorkbook(file, format, ExcelWorkbook::getRangeNames);
}
/**
@ -202,7 +187,7 @@ public class ExcelReader {
/**
* Reads a range by sheet name, named range or address for the workbook into a table.
*
* @param workbook a {@link Workbook} to read from.
* @param workbook an {@link ExcelWorkbook} to read from.
* @param rangeNameOrAddress sheet name, range name or address to read.
* @param headers specifies whether the first row should be used as headers.
* @param skip_rows skip rows from the top of the range.
@ -211,7 +196,7 @@ public class ExcelReader {
* @throws InvalidLocationException when the range name or address is not found.
*/
public static Table readRangeByName(
Workbook workbook,
ExcelWorkbook workbook,
String rangeNameOrAddress,
ExcelHeaders.HeaderBehavior headers,
int skip_rows,
@ -230,11 +215,10 @@ public class ExcelReader {
problemAggregator);
}
Name name = workbook.getName(rangeNameOrAddress);
ExcelRange excelRange;
try {
excelRange = new ExcelRange(name == null ? rangeNameOrAddress : name.getRefersToFormula());
var formula = workbook.getNameFormula(rangeNameOrAddress);
excelRange = new ExcelRange(formula == null ? rangeNameOrAddress : formula);
} catch (IllegalArgumentException e) {
throw new InvalidLocationException(
rangeNameOrAddress,
@ -271,8 +255,8 @@ public class ExcelReader {
readRange(workbook, excelRange, headers, skip_rows, row_limit, problemAggregator));
}
private static <T> T withWorkbook(File file, ExcelFileFormat format, Function<Workbook, T> action)
throws IOException {
private static <T> T withWorkbook(
File file, ExcelFileFormat format, Function<ExcelWorkbook, T> action) throws IOException {
try (ReadOnlyExcelConnection connection =
ExcelConnectionPool.INSTANCE.openReadOnlyConnection(file, format)) {
return connection.withWorkbook(action);
@ -280,7 +264,7 @@ public class ExcelReader {
}
public static Table readRange(
Workbook workbook,
ExcelWorkbook workbook,
ExcelRange excelRange,
ExcelHeaders.HeaderBehavior headers,
int skip_rows,
@ -304,7 +288,7 @@ public class ExcelReader {
}
private static Table readTable(
Workbook workbook,
ExcelWorkbook workbook,
int sheetIndex,
ExcelRange excelRange,
ExcelHeaders.HeaderBehavior headers,
@ -312,7 +296,7 @@ public class ExcelReader {
int rowCount,
ProblemAggregator problemAggregator) {
ExcelSheet sheet = new ExcelSheet(workbook, sheetIndex);
ExcelSheet sheet = workbook.getSheetAt(sheetIndex);
// Expand Single Cell
if (excelRange != null && excelRange.isSingleCell()) {

View File

@ -77,7 +77,8 @@ public class ExcelWriter {
headers =
headers != ExcelHeaders.HeaderBehavior.INFER
? headers
: shouldWriteHeaders(new ExcelSheet(workbook, sheetIndex), firstRow + 1, 1, -1);
: shouldWriteHeaders(
ExcelSheet.forPOIUserModel(workbook, sheetIndex), firstRow + 1, 1, -1);
String sheetName = workbook.getSheetName(sheetIndex - 1);
workbook.removeSheetAt(sheetIndex - 1);
@ -130,7 +131,8 @@ public class ExcelWriter {
headers =
headers != ExcelHeaders.HeaderBehavior.INFER
? headers
: shouldWriteHeaders(new ExcelSheet(workbook, sheetIndex), firstRow + 1, 1, -1);
: shouldWriteHeaders(
ExcelSheet.forPOIUserModel(workbook, sheetIndex), firstRow + 1, 1, -1);
workbook.removeSheetAt(sheetIndex);
Sheet sheet = workbook.createSheet(sheetName);
@ -198,7 +200,7 @@ public class ExcelWriter {
throw new InvalidLocationException(
range.getSheetName(), "Unknown sheet '" + range.getSheetName() + "'.");
}
ExcelSheet sheet = new ExcelSheet(workbook, sheetIndex);
ExcelSheet sheet = ExcelSheet.forPOIUserModel(workbook, sheetIndex);
if (skipRows != 0) {
if (range.isWholeColumn()) {