mirror of
https://github.com/enso-org/enso.git
synced 2024-12-19 18:41:45 +03:00
Excel Reading (XLSX) using SAX Default Handler (#10877)
- Implement Excel reading as a SAXMLParser.
This commit is contained in:
parent
e6bcd5e485
commit
63ed629210
@ -49,7 +49,7 @@ type Excel_Workbook
|
||||
- file: The file to load.
|
||||
- xls_format: Whether to use the old XLS format (default is XLSX).
|
||||
new : File | Temporary_File -> Boolean -> Excel_Workbook
|
||||
new file:(File | Temporary_File) xls_format=False =
|
||||
new file:(File | Temporary_File) xls_format:Boolean=False =
|
||||
file_for_errors = if file.is_a Temporary_File then Nothing else file
|
||||
|
||||
continuation raw_file =
|
||||
@ -73,7 +73,7 @@ type Excel_Workbook
|
||||
- xls_format: Whether to use the old XLS format (default is XLSX).
|
||||
- file: Optional file reference.
|
||||
from_stream : Input_Stream -> Boolean -> File | Nothing -> Excel_Workbook
|
||||
from_stream stream xls_format=False file=Nothing = Excel_Reader.handle_bad_format file <|
|
||||
from_stream stream xls_format:Boolean=False file=Nothing = Excel_Reader.handle_bad_format file <|
|
||||
temp_file = Temporary_File.from_stream_light stream
|
||||
Excel_Workbook.new temp_file xls_format
|
||||
|
||||
@ -89,8 +89,8 @@ type Excel_Workbook
|
||||
## PRIVATE
|
||||
ICON metadata
|
||||
Returns the list of databases (or catalogs) for the connection.
|
||||
databases : Nothing
|
||||
databases self = Nothing
|
||||
databases : Vector (Text | Nothing)
|
||||
databases self = [Nothing]
|
||||
|
||||
## PRIVATE
|
||||
ICON metadata
|
||||
@ -109,7 +109,7 @@ type Excel_Workbook
|
||||
Arguments:
|
||||
- database: The target file to open as an Excel_Workbook.
|
||||
set_database : Text | File -> Excel_Workbook ! Illegal_Argument
|
||||
set_database self database =
|
||||
set_database self database:(Text | File) =
|
||||
if database == self.database then self else
|
||||
file = File.new database
|
||||
if file.exists && file.is_directory.not then Excel_Workbook.new file self.xls_format else
|
||||
@ -163,7 +163,7 @@ type Excel_Workbook
|
||||
Gets the names of all the named ranges.
|
||||
named_ranges : Vector Text
|
||||
named_ranges self = self.with_java_workbook java_workbook->
|
||||
Vector.from_polyglot_array (ExcelReader.readRangeNames java_workbook)
|
||||
Vector.from_polyglot_array java_workbook.getRangeNames
|
||||
|
||||
## PRIVATE
|
||||
ICON metadata
|
||||
|
@ -20,6 +20,7 @@ import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.enso.table.excel.xssfreader.XSSFReaderWorkbook;
|
||||
|
||||
public class ExcelConnectionPool {
|
||||
public static final ExcelConnectionPool INSTANCE = new ExcelConnectionPool();
|
||||
@ -64,7 +65,7 @@ public class ExcelConnectionPool {
|
||||
record.refCount = 1;
|
||||
record.file = file;
|
||||
record.format = format;
|
||||
record.workbook = openWorkbook(file, format, false);
|
||||
record.reopen(true);
|
||||
records.put(key, record);
|
||||
return new ReadOnlyExcelConnection(this, key, record);
|
||||
}
|
||||
@ -212,10 +213,10 @@ public class ExcelConnectionPool {
|
||||
private int refCount;
|
||||
private File file;
|
||||
private ExcelFileFormat format;
|
||||
private Workbook workbook;
|
||||
private ExcelWorkbook workbook;
|
||||
private IOException initializationException = null;
|
||||
|
||||
<T> T withWorkbook(Function<Workbook, T> action) throws IOException {
|
||||
<T> T withWorkbook(Function<ExcelWorkbook, T> action) throws IOException {
|
||||
synchronized (this) {
|
||||
return action.apply(accessCurrentWorkbook());
|
||||
}
|
||||
@ -238,7 +239,10 @@ public class ExcelConnectionPool {
|
||||
}
|
||||
|
||||
try {
|
||||
workbook = openWorkbook(file, format, false);
|
||||
workbook =
|
||||
format == ExcelFileFormat.XLSX
|
||||
? new XSSFReaderWorkbook(file.getAbsolutePath())
|
||||
: ExcelWorkbook.forPOIUserModel(openWorkbook(file, format, false));
|
||||
} catch (IOException e) {
|
||||
initializationException = e;
|
||||
if (throwOnFailure) {
|
||||
@ -248,7 +252,7 @@ public class ExcelConnectionPool {
|
||||
}
|
||||
}
|
||||
|
||||
private Workbook accessCurrentWorkbook() throws IOException {
|
||||
private ExcelWorkbook accessCurrentWorkbook() throws IOException {
|
||||
synchronized (this) {
|
||||
if (workbook == null) {
|
||||
if (initializationException != null) {
|
||||
@ -278,7 +282,7 @@ public class ExcelConnectionPool {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
case XLSX -> {
|
||||
case XLSX, XLSX_FALLBACK -> {
|
||||
try {
|
||||
PackageAccess access = writeAccess ? PackageAccess.READ_WRITE : PackageAccess.READ;
|
||||
OPCPackage pkg = OPCPackage.open(file, access);
|
||||
@ -300,7 +304,7 @@ public class ExcelConnectionPool {
|
||||
private static Workbook createEmptyWorkbook(ExcelFileFormat format) {
|
||||
return switch (format) {
|
||||
case XLS -> new HSSFWorkbook();
|
||||
case XLSX -> new XSSFWorkbook();
|
||||
case XLSX, XLSX_FALLBACK -> new XSSFWorkbook();
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -2,5 +2,6 @@ package org.enso.table.excel;
|
||||
|
||||
public enum ExcelFileFormat {
|
||||
XLS,
|
||||
XLSX
|
||||
XLSX,
|
||||
XLSX_FALLBACK
|
||||
}
|
||||
|
@ -57,7 +57,7 @@ public class ExcelHeaders {
|
||||
|
||||
String[] output = new String[currentEndCol - startCol + 1];
|
||||
for (int col = startCol; col <= currentEndCol; col++) {
|
||||
String cellText = row.getFormattedCell(col);
|
||||
String cellText = row.getCellText(col);
|
||||
String name = cellText.isEmpty() ? "" : deduplicator.makeUnique(cellText);
|
||||
|
||||
output[col - startCol] = name;
|
||||
|
@ -197,7 +197,7 @@ public class ExcelRange {
|
||||
|
||||
Context context = Context.getCurrent();
|
||||
while (currentRow != null && !currentRow.isEmpty(excelRange.getLeftColumn(), rightColumn)) {
|
||||
rightColumn = currentRow.findEndRight(rightColumn);
|
||||
rightColumn = findEndRight(currentRow, rightColumn);
|
||||
bottomRow++;
|
||||
currentRow = sheet.get(bottomRow);
|
||||
|
||||
@ -212,6 +212,16 @@ public class ExcelRange {
|
||||
bottomRow - 1);
|
||||
}
|
||||
|
||||
private static int findEndRight(ExcelRow row, int start) {
|
||||
Context context = Context.getCurrent();
|
||||
int column = start;
|
||||
while (!row.isEmpty(column + 1)) {
|
||||
column++;
|
||||
context.safepoint();
|
||||
}
|
||||
return column;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param index The index to the next character after the parsed value
|
||||
* @param value Parsed integer value or 0 if not valid
|
||||
|
@ -10,24 +10,63 @@ import org.apache.poi.ss.usermodel.DateUtil;
|
||||
import org.apache.poi.ss.usermodel.ExcelNumberFormat;
|
||||
import org.apache.poi.ss.usermodel.FormulaError;
|
||||
import org.apache.poi.ss.usermodel.Row;
|
||||
import org.apache.poi.ss.usermodel.Sheet;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
/** Wrapper class to handle Excel rows. */
|
||||
public class ExcelRow {
|
||||
private static final DataFormatter formatter = new DataFormatter();
|
||||
public interface ExcelRow {
|
||||
/** Gets the initial column index within the row (1-based). */
|
||||
int getFirstColumn();
|
||||
|
||||
private final Row row;
|
||||
private final int firstColumn;
|
||||
private final int lastColumn;
|
||||
private final boolean use1904Format;
|
||||
/** Gets the final column index within the row (1-based). */
|
||||
int getLastColumn();
|
||||
|
||||
public ExcelRow(Row row, boolean use1904Format) {
|
||||
this.row = row;
|
||||
this.firstColumn = row.getFirstCellNum() + 1;
|
||||
this.lastColumn = row.getLastCellNum();
|
||||
this.use1904Format = use1904Format;
|
||||
/** Gets the value of the cell at the given index within the row (1-based). */
|
||||
Object getCellValue(int column);
|
||||
|
||||
/** Gets the text of a cell at the given index within the row (1-based). */
|
||||
String getCellText(int column);
|
||||
|
||||
/** Gets the cell at the given index within the row (1-based). */
|
||||
Cell get(int column);
|
||||
|
||||
/** Checks if the specified cell is empty. */
|
||||
boolean isEmpty(int column);
|
||||
|
||||
/** Checks if the specified set of cells are empty. */
|
||||
boolean isEmpty(int start, int end);
|
||||
|
||||
/** Gets the cells as text. */
|
||||
String[] getCellsAsText(int startCol, int endCol);
|
||||
|
||||
/** Gets the underlying Apache POI Sheet object. */
|
||||
static ExcelRow forPOIUserModel(Sheet sheet, int rowIndex, boolean use1904Format) {
|
||||
var row = sheet.getRow(rowIndex - 1);
|
||||
return row == null
|
||||
? null
|
||||
: new ExcelRowFromPOIUserModel(
|
||||
row, row.getFirstCellNum() + 1, row.getLastCellNum(), use1904Format);
|
||||
}
|
||||
|
||||
static boolean isEmptyHelper(ExcelRow row, int start, int end) {
|
||||
Context context = Context.getCurrent();
|
||||
int currentEnd = end == -1 ? row.getLastColumn() : end;
|
||||
for (int column = Math.max(row.getFirstColumn(), start);
|
||||
column <= Math.min(row.getLastColumn(), currentEnd);
|
||||
column++) {
|
||||
if (!row.isEmpty(column)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
record ExcelRowFromPOIUserModel(Row row, int firstColumn, int lastColumn, boolean use1904Format)
|
||||
implements ExcelRow {
|
||||
private static final DataFormatter formatter = new DataFormatter();
|
||||
|
||||
public int getFirstColumn() {
|
||||
return firstColumn;
|
||||
}
|
||||
@ -93,52 +132,8 @@ public class ExcelRow {
|
||||
}
|
||||
}
|
||||
|
||||
public static CellType getCellType(Cell cell) {
|
||||
if (cell == null) {
|
||||
return CellType._NONE;
|
||||
}
|
||||
|
||||
CellType cellType = cell.getCellType();
|
||||
if (cellType == CellType.FORMULA) {
|
||||
cellType = cell.getCachedFormulaResultType();
|
||||
}
|
||||
|
||||
return cellType;
|
||||
}
|
||||
|
||||
public boolean isEmpty(int column) {
|
||||
CellType cellType = getCellType(get(column));
|
||||
return (cellType == CellType._NONE) || (cellType == CellType.BLANK);
|
||||
}
|
||||
|
||||
public boolean isEmpty(int start, int end) {
|
||||
Context context = Context.getCurrent();
|
||||
int currentEnd = end == -1 ? getLastColumn() : end;
|
||||
for (int column = Math.max(getFirstColumn(), start);
|
||||
column <= Math.min(getLastColumn(), currentEnd);
|
||||
column++) {
|
||||
if (!isEmpty(column)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
context.safepoint();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public int findEndRight(int start) {
|
||||
Context context = Context.getCurrent();
|
||||
int column = start;
|
||||
while (!isEmpty(column + 1)) {
|
||||
column++;
|
||||
context.safepoint();
|
||||
}
|
||||
return column;
|
||||
}
|
||||
|
||||
/** Returns the formatted cell value. */
|
||||
public String getFormattedCell(int col) {
|
||||
var cell = get(col);
|
||||
public String getCellText(int column) {
|
||||
var cell = get(column);
|
||||
if (cell == null) {
|
||||
return "";
|
||||
}
|
||||
@ -161,12 +156,21 @@ public class ExcelRow {
|
||||
}
|
||||
default -> {
|
||||
// Use the default read and then toString.
|
||||
var value = getCellValue(col);
|
||||
var value = getCellValue(column);
|
||||
yield value == null ? "" : value.toString();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public boolean isEmpty(int column) {
|
||||
CellType cellType = getCellType(get(column));
|
||||
return (cellType == CellType._NONE) || (cellType == CellType.BLANK);
|
||||
}
|
||||
|
||||
public boolean isEmpty(int start, int end) {
|
||||
return isEmptyHelper(this, start, end);
|
||||
}
|
||||
|
||||
public String[] getCellsAsText(int startCol, int endCol) {
|
||||
Context context = Context.getCurrent();
|
||||
int currentEndCol = endCol == -1 ? getLastColumn() : endCol;
|
||||
@ -174,7 +178,7 @@ public class ExcelRow {
|
||||
String[] output = new String[currentEndCol - startCol + 1];
|
||||
for (int col = startCol; col <= currentEndCol; col++) {
|
||||
Cell cell = get(col);
|
||||
CellType type = ExcelRow.getCellType(cell);
|
||||
CellType type = getCellType(cell);
|
||||
if (type != CellType._NONE && type != CellType.BLANK && type != CellType.STRING) {
|
||||
return null;
|
||||
}
|
||||
@ -185,4 +189,18 @@ public class ExcelRow {
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
private static CellType getCellType(Cell cell) {
|
||||
if (cell == null) {
|
||||
return CellType._NONE;
|
||||
}
|
||||
|
||||
CellType cellType = cell.getCellType();
|
||||
if (cellType == CellType.FORMULA) {
|
||||
cellType = cell.getCachedFormulaResultType();
|
||||
}
|
||||
|
||||
return cellType;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,37 +1,83 @@
|
||||
package org.enso.table.excel;
|
||||
|
||||
import org.apache.poi.ss.usermodel.Row;
|
||||
import org.apache.poi.ss.usermodel.Sheet;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
|
||||
/** Wrapper class to handle Excel sheets. */
|
||||
public class ExcelSheet {
|
||||
private final Sheet sheet;
|
||||
private final int firstRow;
|
||||
private final int lastRow;
|
||||
private final boolean use1904Format;
|
||||
public interface ExcelSheet {
|
||||
/** Gets the index of the sheet within the workbook (0-based). */
|
||||
int getSheetIndex();
|
||||
|
||||
public ExcelSheet(Workbook workbook, int sheetIndex) {
|
||||
this.sheet = workbook.getSheetAt(sheetIndex);
|
||||
this.firstRow = sheet.getFirstRowNum() + 1;
|
||||
this.lastRow = sheet.getLastRowNum() + 1;
|
||||
this.use1904Format = ExcelUtils.is1904DateSystem(workbook);
|
||||
/** Gets the name of the sheet. */
|
||||
String getName();
|
||||
|
||||
/** Gets the initial row index within the sheet (1-based). */
|
||||
int getFirstRow();
|
||||
|
||||
/** Gets the final row index within the sheet (1-based). */
|
||||
int getLastRow();
|
||||
|
||||
/**
|
||||
* Gets the row at the given index within the sheet (1-based)
|
||||
*
|
||||
* @param row the row index (1-based).
|
||||
* @return the row object or null if the row index is out of range or doesn't exist.
|
||||
*/
|
||||
ExcelRow get(int row);
|
||||
|
||||
/** Gets the underlying Apache POI Sheet object - may be null. Provided for Writer use only. */
|
||||
Sheet getSheet();
|
||||
|
||||
/** Gets the underlying Apache POI Sheet object. */
|
||||
static ExcelSheet forPOIUserModel(Workbook workbook, int sheetIndex) {
|
||||
var sheet = workbook.getSheetAt(sheetIndex);
|
||||
return new ExcelSheetFromPOIUserModel(
|
||||
sheet,
|
||||
sheetIndex,
|
||||
sheet.getSheetName(),
|
||||
sheet.getFirstRowNum() + 1,
|
||||
sheet.getLastRowNum() + 1,
|
||||
ExcelUtils.is1904DateSystem(workbook));
|
||||
}
|
||||
|
||||
public int getLastRow() {
|
||||
return lastRow;
|
||||
record ExcelSheetFromPOIUserModel(
|
||||
Sheet sheet,
|
||||
int sheetIndex,
|
||||
String sheetName,
|
||||
int firstRow,
|
||||
int lastRow,
|
||||
boolean use1904Format)
|
||||
implements ExcelSheet {
|
||||
@Override
|
||||
public int getSheetIndex() {
|
||||
return sheetIndex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return sheetName;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getFirstRow() {
|
||||
return firstRow;
|
||||
}
|
||||
|
||||
public ExcelRow get(int row) {
|
||||
Row underlyingRow = row < firstRow || row > lastRow ? null : sheet.getRow(row - 1);
|
||||
return underlyingRow == null ? null : new ExcelRow(underlyingRow, use1904Format);
|
||||
@Override
|
||||
public int getLastRow() {
|
||||
return lastRow;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExcelRow get(int row) {
|
||||
return row < firstRow || row > lastRow
|
||||
? null
|
||||
: ExcelRow.forPOIUserModel(sheet, row, use1904Format);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sheet getSheet() {
|
||||
return sheet;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,10 @@
|
||||
package org.enso.table.excel;
|
||||
|
||||
import java.time.*;
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.LocalTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.time.temporal.Temporal;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
|
@ -0,0 +1,123 @@
|
||||
package org.enso.table.excel;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.poi.ss.usermodel.Name;
|
||||
|
||||
/** Represents an Excel workbook. Wraps the underlying Apache POI Workbook object. */
|
||||
public interface ExcelWorkbook {
|
||||
/**
|
||||
* Get the number of spreadsheets in the workbook
|
||||
*
|
||||
* @return the number of sheets
|
||||
*/
|
||||
int getNumberOfSheets();
|
||||
|
||||
/**
|
||||
* Returns the index of the sheet by its name
|
||||
*
|
||||
* @param name the sheet name
|
||||
* @return index of the sheet (0 based)
|
||||
*/
|
||||
int getSheetIndex(String name);
|
||||
|
||||
/**
|
||||
* Get the sheet name
|
||||
*
|
||||
* @param sheet sheet number (0 based)
|
||||
* @return Sheet name
|
||||
*/
|
||||
String getSheetName(int sheet);
|
||||
|
||||
/**
|
||||
* @return the total number of defined names in this workbook
|
||||
*/
|
||||
int getNumberOfNames();
|
||||
|
||||
/**
|
||||
* Get all the range names in the workbook
|
||||
*
|
||||
* @return an array of range names
|
||||
*/
|
||||
String[] getRangeNames();
|
||||
|
||||
/**
|
||||
* Get the formula for a named range.
|
||||
*
|
||||
* @param name the name of the range.
|
||||
* @return the formula for the range or null if not found.
|
||||
*/
|
||||
String getNameFormula(String name);
|
||||
|
||||
/**
|
||||
* Get a sheet by its index
|
||||
*
|
||||
* @param sheetIndex the index of the sheet (0 based)
|
||||
* @return the sheet as an ExcelSheet object
|
||||
* @throws IllegalArgumentException if the sheet index is out of range.
|
||||
*/
|
||||
ExcelSheet getSheetAt(int sheetIndex);
|
||||
|
||||
/**
|
||||
* Close the underlying input resource (File or Stream), from which the Workbook was read.
|
||||
*
|
||||
* <p>Once this has been called, no further operations, updates or reads should be performed on
|
||||
* the Workbook.
|
||||
*/
|
||||
void close() throws IOException;
|
||||
|
||||
/**
|
||||
* Create an ExcelWorkbook object from an Apache POI Workbook object
|
||||
*
|
||||
* @param workbook the Apache POI Workbook object
|
||||
* @return the ExcelWorkbook object
|
||||
*/
|
||||
static ExcelWorkbook forPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook) {
|
||||
return new ExcelWorkbookFromPOIUserModel(workbook);
|
||||
}
|
||||
|
||||
// ** Wrap a Workbook object in the interface. */
|
||||
record ExcelWorkbookFromPOIUserModel(org.apache.poi.ss.usermodel.Workbook workbook)
|
||||
implements ExcelWorkbook {
|
||||
@Override
|
||||
public int getNumberOfSheets() {
|
||||
return workbook.getNumberOfSheets();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSheetIndex(String name) {
|
||||
return workbook.getSheetIndex(name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSheetName(int sheet) {
|
||||
return workbook.getSheetName(sheet);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumberOfNames() {
|
||||
return workbook.getNumberOfNames();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getRangeNames() {
|
||||
var names = workbook.getAllNames();
|
||||
return names.stream().map(Name::getNameName).toArray(String[]::new);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getNameFormula(String name) {
|
||||
var namedRange = workbook.getName(name);
|
||||
return namedRange == null ? null : namedRange.getRefersToFormula();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExcelSheet getSheetAt(int sheetIndex) {
|
||||
return ExcelSheet.forPOIUserModel(workbook, sheetIndex);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
workbook.close();
|
||||
}
|
||||
}
|
||||
}
|
@ -2,7 +2,6 @@ package org.enso.table.excel;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.function.Function;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
|
||||
public class ReadOnlyExcelConnection implements AutoCloseable {
|
||||
|
||||
@ -28,7 +27,7 @@ public class ReadOnlyExcelConnection implements AutoCloseable {
|
||||
record = null;
|
||||
}
|
||||
|
||||
public synchronized <T> T withWorkbook(Function<Workbook, T> f) throws IOException {
|
||||
public synchronized <T> T withWorkbook(Function<ExcelWorkbook, T> f) throws IOException {
|
||||
if (record == null) {
|
||||
throw new IllegalStateException("ReadOnlyExcelConnection is being used after it was closed.");
|
||||
}
|
||||
|
@ -0,0 +1,29 @@
|
||||
package org.enso.table.excel.xssfreader;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import org.apache.poi.xssf.model.StylesTable;
|
||||
|
||||
/** Provides the format strings for number formats in an XSSF workbook. */
|
||||
public class XSSFReaderFormats {
|
||||
private final StylesTable stylesTable;
|
||||
private final Map<Short, String> numberFormats = new HashMap<>();
|
||||
|
||||
public XSSFReaderFormats(StylesTable stylesTable) {
|
||||
this.stylesTable = stylesTable;
|
||||
}
|
||||
|
||||
public String getNumberFormatAt(short styleIdx) {
|
||||
if (numberFormats.containsKey(styleIdx)) {
|
||||
return numberFormats.get(styleIdx);
|
||||
}
|
||||
|
||||
var style = stylesTable.getStyleAt(styleIdx);
|
||||
var format = style == null ? "General" : style.getDataFormatString();
|
||||
if (format == null || format.equals("General")) {
|
||||
format = "";
|
||||
}
|
||||
numberFormats.put(styleIdx, format);
|
||||
return format;
|
||||
}
|
||||
}
|
@ -0,0 +1,125 @@
|
||||
package org.enso.table.excel.xssfreader;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.SortedMap;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.enso.table.excel.ExcelRow;
|
||||
|
||||
public class XSSFReaderRow implements ExcelRow {
|
||||
private static final DataFormatter formatter = new DataFormatter();
|
||||
private final SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue> data;
|
||||
private final boolean use1904Dates;
|
||||
|
||||
public XSSFReaderRow(
|
||||
SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue> data, boolean use1904Dates) {
|
||||
this.data = data;
|
||||
this.use1904Dates = use1904Dates;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getFirstColumn() {
|
||||
return data.firstKey();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getLastColumn() {
|
||||
return data.lastKey();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Cell get(int column) {
|
||||
// Not supported as we don't have the underlying Apache POI Cell object.
|
||||
throw new UnsupportedOperationException("XSSFReader does not support getting the Cell object.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getCellValue(int column) {
|
||||
var cell = data.get((short) column);
|
||||
if (cell == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var dataType = cell.dataType();
|
||||
return switch (dataType) {
|
||||
case BLANK -> null;
|
||||
case BOOL -> cell.getBooleanValue();
|
||||
case DATE -> LocalDateTime.parse(cell.strValue()); // Don't believe used by Excel.
|
||||
case INLINE_STRING, SST_STRING, FORMULA_STRING -> cell.strValue();
|
||||
case INTEGER -> cell.getIntegerValue();
|
||||
case NUMBER -> {
|
||||
double dbl = cell.getNumberValue();
|
||||
long longVal = (long) dbl;
|
||||
if (dbl == longVal) {
|
||||
yield (long) dbl;
|
||||
} else {
|
||||
yield dbl;
|
||||
}
|
||||
}
|
||||
case OLE_DATE -> cell.getDateValue(use1904Dates);
|
||||
case OLE_DATETIME -> cell.getDateTimeValue(use1904Dates);
|
||||
case ERROR -> null;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getCellText(int column) {
|
||||
var cell = data.get((short) column);
|
||||
if (cell == null) {
|
||||
return "";
|
||||
}
|
||||
|
||||
var dataType = cell.dataType();
|
||||
return switch (dataType) {
|
||||
case BLANK -> "";
|
||||
case NUMBER, OLE_DATETIME, OLE_DATE, INTEGER -> {
|
||||
// Special handling for Number or Date cells as want to keep formatting.
|
||||
var formatText = cell.format();
|
||||
if (formatText == null || formatText.isEmpty()) {
|
||||
yield cell.strValue();
|
||||
}
|
||||
yield formatter.formatRawCellContents(cell.getNumberValue(), -1, formatText, use1904Dates);
|
||||
}
|
||||
case BOOL -> cell.getBooleanValue() ? "TRUE" : "FALSE";
|
||||
default -> cell.strValue();
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty(int column) {
|
||||
var cell = data.get((short) column);
|
||||
return cell == null || cell.strValue().isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty(int start, int end) {
|
||||
int currentEnd = end == -1 ? getLastColumn() : end;
|
||||
for (int column = Math.max(getFirstColumn(), start);
|
||||
column <= Math.min(getLastColumn(), currentEnd);
|
||||
column++) {
|
||||
if (!isEmpty(column)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getCellsAsText(int startCol, int endCol) {
|
||||
int currentEndCol = endCol == -1 ? getLastColumn() : endCol;
|
||||
|
||||
String[] output = new String[currentEndCol - startCol + 1];
|
||||
for (int col = startCol; col <= currentEndCol; col++) {
|
||||
|
||||
var cell = data.get((short) col);
|
||||
if (cell != null && !cell.dataType().isString()) {
|
||||
// Short circuit if find not a string cell.
|
||||
return null;
|
||||
}
|
||||
|
||||
output[col - startCol] = cell == null ? "" : cell.strValue();
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
@ -0,0 +1,150 @@
|
||||
package org.enso.table.excel.xssfreader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.ss.usermodel.Sheet;
|
||||
import org.apache.poi.util.XMLHelper;
|
||||
import org.enso.table.excel.ExcelRow;
|
||||
import org.enso.table.excel.ExcelSheet;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
public class XSSFReaderSheet implements ExcelSheet {
|
||||
private final int sheetIdx;
|
||||
private final String sheetName;
|
||||
private final String relId;
|
||||
private final XSSFReaderWorkbook parent;
|
||||
|
||||
private boolean hasReadSheetData = false;
|
||||
private String dimensions;
|
||||
private int firstRow;
|
||||
private int lastRow;
|
||||
private Map<Integer, SortedMap<Short, XSSFReaderSheetXMLHandler.CellValue>> rowData;
|
||||
|
||||
public XSSFReaderSheet(int sheetIdx, String sheetName, String relId, XSSFReaderWorkbook parent) {
|
||||
this.sheetIdx = sheetIdx;
|
||||
this.sheetName = sheetName;
|
||||
this.relId = relId;
|
||||
this.parent = parent;
|
||||
}
|
||||
|
||||
private synchronized void ensureReadSheetData() {
|
||||
if (hasReadSheetData) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
var strings = parent.getSharedStrings();
|
||||
var styles = parent.getStyles();
|
||||
var handler =
|
||||
new XSSFReaderSheetXMLHandler(styles, strings) {
|
||||
@Override
|
||||
protected void onDimensions(String dimension) {
|
||||
handleOnDimensions(dimension);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onStartRow(int rowNum) {
|
||||
handleOnStartRow(rowNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onCell(int rowNumber, short columnNumber, String ref, CellValue value) {
|
||||
handleOnCell(rowNumber, columnNumber, value);
|
||||
}
|
||||
};
|
||||
|
||||
var xmlReader = XMLHelper.newXMLReader();
|
||||
xmlReader.setContentHandler(handler);
|
||||
|
||||
rowData = new HashMap<>();
|
||||
|
||||
try {
|
||||
parent.withReader(
|
||||
reader -> {
|
||||
try {
|
||||
var sheet = reader.getSheet(relId);
|
||||
xmlReader.parse(new InputSource(sheet));
|
||||
} catch (SAXException | InvalidFormatException | IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
hasReadSheetData = true;
|
||||
} catch (SAXException | ParserConfigurationException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSheetIndex() {
|
||||
return sheetIdx;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return sheetName;
|
||||
}
|
||||
|
||||
public String getDimensions() {
|
||||
ensureReadSheetData();
|
||||
return dimensions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getFirstRow() {
|
||||
ensureReadSheetData();
|
||||
return firstRow;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getLastRow() {
|
||||
ensureReadSheetData();
|
||||
return lastRow;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExcelRow get(int row) {
|
||||
ensureReadSheetData();
|
||||
|
||||
if (!rowData.containsKey(row)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new XSSFReaderRow(rowData.get(row), parent.use1904Format());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sheet getSheet() {
|
||||
// Not supported as we don't have the underlying Apache POI Sheet object.
|
||||
throw new UnsupportedOperationException(
|
||||
"XSSFReader does not support getting the Sheet object.");
|
||||
}
|
||||
|
||||
protected void handleOnDimensions(String dimension) {
|
||||
dimensions = dimension;
|
||||
}
|
||||
|
||||
private void handleOnStartRow(int rowNum) {
|
||||
if (firstRow == 0 || rowNum < firstRow) {
|
||||
firstRow = rowNum;
|
||||
}
|
||||
|
||||
if (lastRow == 0 || rowNum > lastRow) {
|
||||
lastRow = rowNum;
|
||||
}
|
||||
}
|
||||
|
||||
private void handleOnCell(
|
||||
int rowNumber, short columnNumber, XSSFReaderSheetXMLHandler.CellValue value) {
|
||||
rowData.computeIfAbsent(rowNumber, k -> new TreeMap<>()).put(columnNumber, value);
|
||||
}
|
||||
}
|
@ -0,0 +1,259 @@
|
||||
package org.enso.table.excel.xssfreader;
|
||||
|
||||
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
|
||||
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.temporal.Temporal;
|
||||
import org.apache.poi.ss.usermodel.DateUtil;
|
||||
import org.apache.poi.xssf.model.SharedStrings;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
|
||||
import org.enso.table.excel.ExcelUtils;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
// Based on the XSSFSheetXMLHandler class from Apache POI.
|
||||
/**
|
||||
* SAX-based Handler to Read Excel XML on top of POI support. Technical specification can be found
|
||||
* at:
|
||||
* https://learn.microsoft.com/en-us/openspecs/office_standards/ms-oe376/db9b9b72-b10b-4e7e-844c-09f88c972219
|
||||
* https://ecma-international.org/publications-and-standards/standards/ecma-376/
|
||||
*/
|
||||
public class XSSFReaderSheetXMLHandler extends DefaultHandler {
|
||||
private final XSSFReaderFormats styles;
|
||||
private final SharedStrings sharedStrings;
|
||||
|
||||
/** The kinds of cell content distinguished when reading a sheet. */
public enum XSSDataType {
  BLANK,
  BOOL,
  DATE,
  ERROR,
  INLINE_STRING,
  SST_STRING,
  NUMBER,
  INTEGER,
  OLE_DATE,
  OLE_DATETIME,
  FORMULA_STRING;

  /**
   * Checks whether this type is one of the string kinds.
   *
   * @return true for INLINE_STRING, SST_STRING and FORMULA_STRING, false otherwise
   */
  public boolean isString() {
    return switch (this) {
      case INLINE_STRING, SST_STRING, FORMULA_STRING -> true;
      default -> false;
    };
  }
}
|
||||
|
||||
// Record if seen a value element
|
||||
private boolean seenValue;
|
||||
|
||||
// Set when V start element is seen
|
||||
private boolean vIsOpen;
|
||||
|
||||
// Set when an Inline String "is" is seen
|
||||
private boolean isIsOpen;
|
||||
|
||||
// The current row being read (or -1 if not in a row)
|
||||
private int rowNumber = -1;
|
||||
|
||||
// Handle missing rowNumber in the XML (happens in Excel), first row would be row 1.
|
||||
private int nextRowNumber = 1;
|
||||
|
||||
// The current cell being read (or null if not in a cell)
|
||||
private String cellRef;
|
||||
|
||||
// Set when cell start element is seen, used when cell close element is seen.
|
||||
private XSSDataType dataType;
|
||||
|
||||
// Gathers characters as they are seen.
|
||||
private final StringBuilder value = new StringBuilder(64);
|
||||
private String numberFormat = null;
|
||||
|
||||
public XSSFReaderSheetXMLHandler(XSSFReaderFormats styles, SharedStrings strings) {
|
||||
this.styles = styles;
|
||||
this.sharedStrings = strings;
|
||||
}
|
||||
|
||||
private boolean isTextTag(String name) {
|
||||
return "v".equals(name) || "inlineStr".equals(name) || ("t".equals(name) && isIsOpen);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) {
|
||||
if (uri != null && !NS_SPREADSHEETML.equals(uri)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isTextTag(localName)) {
|
||||
seenValue = true;
|
||||
vIsOpen = true;
|
||||
if (!isIsOpen) {
|
||||
value.setLength(0);
|
||||
}
|
||||
} else {
|
||||
switch (localName) {
|
||||
case "dimension": // Dimensions of sheet
|
||||
var dimension = attributes.getValue("ref");
|
||||
if (dimension != null) {
|
||||
onDimensions(dimension);
|
||||
}
|
||||
break;
|
||||
case "row": // Row
|
||||
String rowNumStr = attributes.getValue("r");
|
||||
rowNumber = rowNumStr == null ? nextRowNumber : Integer.parseInt(rowNumStr);
|
||||
onStartRow(rowNumber);
|
||||
break;
|
||||
case "c": // Cell
|
||||
cellRef = attributes.getValue("r");
|
||||
seenValue = false;
|
||||
|
||||
String cellType = attributes.getValue("t");
|
||||
if (cellType == null) {
|
||||
cellType = "n"; // Number is default
|
||||
}
|
||||
|
||||
dataType =
|
||||
switch (cellType) {
|
||||
case "b" -> XSSDataType.BOOL;
|
||||
case "e" -> XSSDataType.ERROR;
|
||||
case "d" -> XSSDataType.DATE; // Date in ISO 8601 format.
|
||||
case "inlineStr" -> XSSDataType.INLINE_STRING;
|
||||
case "s" -> XSSDataType.SST_STRING;
|
||||
case "str" -> XSSDataType.FORMULA_STRING; // String formula
|
||||
default -> XSSDataType.NUMBER;
|
||||
};
|
||||
|
||||
// Read the format for NUMBER
|
||||
numberFormat = null;
|
||||
if (dataType == XSSDataType.NUMBER) {
|
||||
String cellStyleStr = attributes.getValue("s");
|
||||
if (cellStyleStr != null) {
|
||||
short styleIndex = (short) Integer.parseInt(cellStyleStr);
|
||||
numberFormat = styles.getNumberFormatAt(styleIndex);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "is": // Inline String
|
||||
isIsOpen = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Captures characters if a suitable element is open. */
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length) {
|
||||
if (vIsOpen) {
|
||||
value.append(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String qName) {
|
||||
if (uri != null && !NS_SPREADSHEETML.equals(uri)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isTextTag(localName)) {
|
||||
vIsOpen = false;
|
||||
} else {
|
||||
switch (localName) {
|
||||
case "sheetData" -> onSheetEnd();
|
||||
case "row" -> {
|
||||
nextRowNumber = rowNumber + 1;
|
||||
rowNumber = -1;
|
||||
}
|
||||
case "c" -> outputCellValue();
|
||||
case "is" -> isIsOpen = false;
|
||||
case "v" -> vIsOpen = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public record CellValue(XSSDataType dataType, String strValue, String format) {
|
||||
public boolean getBooleanValue() {
|
||||
return strValue.charAt(0) == '1';
|
||||
}
|
||||
|
||||
public double getNumberValue() {
|
||||
return Double.parseDouble(strValue);
|
||||
}
|
||||
|
||||
public long getIntegerValue() {
|
||||
return Long.parseLong(strValue);
|
||||
}
|
||||
|
||||
public Temporal getDateValue(boolean use1904Dates) {
|
||||
return use1904Dates
|
||||
? ExcelUtils.fromExcelDateTime1904(getIntegerValue())
|
||||
: ExcelUtils.fromExcelDateTime(getIntegerValue());
|
||||
}
|
||||
|
||||
public Temporal getDateTimeValue(boolean use1904Dates) {
|
||||
if (use1904Dates) {
|
||||
var datetime = ExcelUtils.fromExcelDateTime1904(getNumberValue());
|
||||
if (datetime instanceof ZonedDateTime zdt
|
||||
&& zdt.getYear() == 1904
|
||||
&& zdt.getDayOfYear() == 1
|
||||
&& !format.contains("y")
|
||||
&& !format.contains("M")
|
||||
&& !format.contains("d")) {
|
||||
datetime = zdt.toLocalTime();
|
||||
}
|
||||
return datetime;
|
||||
}
|
||||
|
||||
return ExcelUtils.fromExcelDateTime(getNumberValue());
|
||||
}
|
||||
}
|
||||
|
||||
public String getStringValue() {
|
||||
if (dataType == XSSDataType.SST_STRING) {
|
||||
return getSharedString(value.toString());
|
||||
} else if (dataType == XSSDataType.INLINE_STRING) {
|
||||
return new XSSFRichTextString(value.toString()).toString();
|
||||
}
|
||||
return value.toString();
|
||||
}
|
||||
|
||||
private String getSharedString(String value) {
|
||||
int idx = Integer.parseInt(value);
|
||||
var ss = sharedStrings.getItemAt(idx);
|
||||
return ss == null ? null : ss.toString();
|
||||
}
|
||||
|
||||
private void outputCellValue() {
|
||||
short columnNumber = 0;
|
||||
int i = 0;
|
||||
char c;
|
||||
while (i < cellRef.length() && (c = cellRef.charAt(i)) >= 'A' && c <= 'Z') {
|
||||
columnNumber = (short) (columnNumber * 26 + (c - 'A' + 1));
|
||||
i++;
|
||||
}
|
||||
|
||||
if (!seenValue) {
|
||||
onCell(rowNumber, columnNumber, cellRef, new CellValue(XSSDataType.BLANK, "", null));
|
||||
return;
|
||||
}
|
||||
|
||||
var stringValue = getStringValue();
|
||||
if (dataType == XSSDataType.NUMBER) {
|
||||
boolean isInteger = !stringValue.contains(".");
|
||||
boolean isDate = DateUtil.isADateFormat(-1, numberFormat);
|
||||
if (isInteger && isDate) {
|
||||
dataType = XSSDataType.OLE_DATE;
|
||||
} else if (isInteger) {
|
||||
dataType = XSSDataType.INTEGER;
|
||||
} else if (isDate) {
|
||||
dataType = XSSDataType.OLE_DATETIME;
|
||||
}
|
||||
}
|
||||
|
||||
var cellValue = new CellValue(dataType, stringValue, numberFormat);
|
||||
onCell(rowNumber, columnNumber, cellRef, cellValue);
|
||||
}
|
||||
|
||||
protected void onDimensions(String dimension) {}
|
||||
|
||||
protected void onStartRow(int rowNumber) {}
|
||||
|
||||
protected void onCell(int rowNumber, short columnNumber, String ref, CellValue cellValue) {}
|
||||
|
||||
protected void onSheetEnd() {}
|
||||
}
|
@ -0,0 +1,284 @@
|
||||
package org.enso.table.excel.xssfreader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Consumer;
|
||||
import javax.xml.XMLConstants;
|
||||
import javax.xml.namespace.NamespaceContext;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
import org.apache.poi.ooxml.util.DocumentHelper;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.ss.usermodel.RichTextString;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFReader;
|
||||
import org.apache.poi.xssf.model.SharedStrings;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRelation;
|
||||
import org.enso.table.excel.ExcelSheet;
|
||||
import org.enso.table.excel.ExcelWorkbook;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
public class XSSFReaderWorkbook implements ExcelWorkbook {
|
||||
private static final XPathFactory xpathFactory = XPathFactory.newInstance();
|
||||
private static final NamespaceContext namespaceContext = new SpreadsheetContext();
|
||||
private static final Map<String, XPathExpression> xpathCache = new HashMap<>();
|
||||
|
||||
private static XPathExpression compileXPathWithNamespace(String xpath)
|
||||
throws XPathExpressionException {
|
||||
if (!xpathCache.containsKey(xpath)) {
|
||||
var newXPath = xpathFactory.newXPath();
|
||||
newXPath.setNamespaceContext(namespaceContext);
|
||||
var compiled = newXPath.compile(xpath);
|
||||
xpathCache.put(xpath, compiled);
|
||||
}
|
||||
return xpathCache.get(xpath);
|
||||
}
|
||||
|
||||
private static class SpreadsheetContext implements NamespaceContext {
|
||||
@Override
|
||||
public String getNamespaceURI(String prefix) {
|
||||
if (prefix == null) {
|
||||
throw new IllegalArgumentException("prefix cannot be null");
|
||||
}
|
||||
return prefix.equals("ss") ? XSSFRelation.NS_SPREADSHEETML : XMLConstants.NULL_NS_URI;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPrefix(String namespaceURI) {
|
||||
if (namespaceURI == null) {
|
||||
throw new IllegalArgumentException("namespaceURI cannot be null");
|
||||
}
|
||||
return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML) ? "ss" : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> getPrefixes(String namespaceURI) {
|
||||
if (namespaceURI == null) {
|
||||
throw new IllegalArgumentException("namespaceURI cannot be null");
|
||||
}
|
||||
return namespaceURI.equals(XSSFRelation.NS_SPREADSHEETML)
|
||||
? Collections.singleton("ss").iterator()
|
||||
: Arrays.stream(new String[0]).iterator();
|
||||
}
|
||||
}
|
||||
|
||||
public static final String WORKBOOK_CONFIG_XPATH = "/ss:workbook/ss:workbookPr";
|
||||
public static final String SHEET_NAME_XPATH = "/ss:workbook/ss:sheets/ss:sheet";
|
||||
public static final String NAMED_RANGE_XPATH = "/ss:workbook/ss:definedNames/ss:definedName";
|
||||
|
||||
private final String path;
|
||||
|
||||
private boolean use1904DateSystemFlag = false;
|
||||
private List<SheetInfo> sheetInfos;
|
||||
private Map<String, SheetInfo> sheetInfoMap;
|
||||
private Map<String, NamedRange> namedRangeMap;
|
||||
|
||||
private boolean hasReadShared = false;
|
||||
private SharedStrings sharedStrings;
|
||||
private XSSFReaderFormats styles;
|
||||
|
||||
public XSSFReaderWorkbook(String path) throws IOException {
|
||||
this.path = path;
|
||||
|
||||
// Read the workbook data
|
||||
this.readWorkbookData();
|
||||
}
|
||||
|
||||
public String getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
void withReader(Consumer<XSSFReader> action) throws IOException {
|
||||
try (var pkg = OPCPackage.open(path, PackageAccess.READ)) {
|
||||
var reader = new XSSFReader(pkg);
|
||||
action.accept(reader);
|
||||
} catch (OpenXML4JException e) {
|
||||
throw new IOException(
|
||||
"Invalid format encountered when opening the file " + path + " as XLSX.", e);
|
||||
}
|
||||
}
|
||||
|
||||
private record SheetInfo(int index, int sheetId, String name, String relID, boolean visible) {}
|
||||
|
||||
private record NamedRange(String name, String formula) {}
|
||||
|
||||
private void readWorkbookData() throws IOException {
|
||||
withReader(
|
||||
reader -> {
|
||||
try {
|
||||
var workbookData = reader.getWorkbookData();
|
||||
var workbookDoc = DocumentHelper.readDocument(workbookData);
|
||||
read1904DateSetting(workbookDoc);
|
||||
readSheetInfo(workbookDoc);
|
||||
readNamedRanges(workbookDoc);
|
||||
} catch (SAXException
|
||||
| IOException
|
||||
| InvalidFormatException
|
||||
| XPathExpressionException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private void readNamedRanges(Document workbookDoc) throws XPathExpressionException {
|
||||
var namesXPath = compileXPathWithNamespace(NAMED_RANGE_XPATH);
|
||||
var nameNodes = (NodeList) namesXPath.evaluate(workbookDoc, XPathConstants.NODESET);
|
||||
namedRangeMap = new HashMap<>();
|
||||
for (int i = 0; i < nameNodes.getLength(); i++) {
|
||||
var node = nameNodes.item(i);
|
||||
var name = node.getAttributes().getNamedItem("name").getNodeValue();
|
||||
var formula = node.getTextContent();
|
||||
namedRangeMap.put(name, new NamedRange(name, formula));
|
||||
}
|
||||
}
|
||||
|
||||
private void readSheetInfo(Document workbookDoc) throws XPathExpressionException {
|
||||
var sheetXPath = compileXPathWithNamespace(SHEET_NAME_XPATH);
|
||||
var sheetNodes = (NodeList) sheetXPath.evaluate(workbookDoc, XPathConstants.NODESET);
|
||||
sheetInfos = new ArrayList<>(sheetNodes.getLength());
|
||||
sheetInfoMap = new HashMap<>();
|
||||
for (int i = 0; i < sheetNodes.getLength(); i++) {
|
||||
var node = sheetNodes.item(i);
|
||||
var sheetName = node.getAttributes().getNamedItem("name").getNodeValue();
|
||||
var sheetId = Integer.parseInt(node.getAttributes().getNamedItem("sheetId").getNodeValue());
|
||||
var relId = node.getAttributes().getNamedItem("r:id").getNodeValue();
|
||||
var visible = node.getAttributes().getNamedItem("state") == null;
|
||||
var sheetInfo = new SheetInfo(i, sheetId, sheetName, relId, visible);
|
||||
sheetInfos.add(sheetInfo);
|
||||
sheetInfoMap.put(sheetName, sheetInfo);
|
||||
}
|
||||
}
|
||||
|
||||
private void read1904DateSetting(Document workbookDoc) throws XPathExpressionException {
|
||||
var workbookXPath = compileXPathWithNamespace(WORKBOOK_CONFIG_XPATH);
|
||||
var workbookNode = (Node) workbookXPath.evaluate(workbookDoc, XPathConstants.NODE);
|
||||
if (workbookNode != null) {
|
||||
var date1904 = workbookNode.getAttributes().getNamedItem("date1904");
|
||||
use1904DateSystemFlag = date1904 != null && "1".equals(date1904.getNodeValue());
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void ensureReadShared() {
|
||||
if (hasReadShared) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
withReader(
|
||||
reader -> {
|
||||
try {
|
||||
reader.setUseReadOnlySharedStringsTable(true);
|
||||
sharedStrings = reader.getSharedStringsTable();
|
||||
if (sharedStrings == null) {
|
||||
sharedStrings =
|
||||
new SharedStrings() {
|
||||
@Override
|
||||
public RichTextString getItemAt(int idx) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCount() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getUniqueCount() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Read the styles table and attach the format data
|
||||
var stylesTable = reader.getStylesTable();
|
||||
styles = new XSSFReaderFormats(stylesTable);
|
||||
|
||||
hasReadShared = true;
|
||||
} catch (InvalidFormatException | IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Flag that workbook is in 1904 format. */
|
||||
boolean use1904Format() {
|
||||
return use1904DateSystemFlag;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumberOfSheets() {
|
||||
return sheetInfoMap.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSheetIndex(String name) {
|
||||
if (!sheetInfoMap.containsKey(name)) {
|
||||
return -1;
|
||||
}
|
||||
return sheetInfoMap.get(name).index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSheetName(int sheet) {
|
||||
if (sheet < 0 || sheet >= sheetInfos.size()) {
|
||||
throw new IllegalArgumentException("Sheet index out of range: " + sheet);
|
||||
}
|
||||
return sheetInfos.get(sheet).name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumberOfNames() {
|
||||
return namedRangeMap.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] getRangeNames() {
|
||||
return namedRangeMap.keySet().toArray(String[]::new);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getNameFormula(String name) {
|
||||
var namedRange = namedRangeMap.get(name);
|
||||
return namedRange == null ? null : namedRange.formula;
|
||||
}
|
||||
|
||||
public SharedStrings getSharedStrings() {
|
||||
ensureReadShared();
|
||||
return sharedStrings;
|
||||
}
|
||||
|
||||
public XSSFReaderFormats getStyles() {
|
||||
ensureReadShared();
|
||||
return styles;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExcelSheet getSheetAt(int sheetIndex) {
|
||||
if (sheetIndex < 0 || sheetIndex >= sheetInfos.size()) {
|
||||
throw new IllegalArgumentException("Sheet index out of range: " + sheetIndex);
|
||||
}
|
||||
var sheetInfo = sheetInfos.get(sheetIndex);
|
||||
return new XSSFReaderSheet(sheetIndex, sheetInfo.name, sheetInfo.relID, this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
// Nothing to do
|
||||
}
|
||||
}
|
@ -7,9 +7,6 @@ import java.util.List;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.ss.usermodel.Name;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
import org.apache.poi.ss.util.CellReference;
|
||||
import org.enso.table.data.column.builder.Builder;
|
||||
import org.enso.table.data.column.builder.InferredBuilder;
|
||||
@ -24,6 +21,7 @@ import org.enso.table.excel.ExcelHeaders;
|
||||
import org.enso.table.excel.ExcelRange;
|
||||
import org.enso.table.excel.ExcelRow;
|
||||
import org.enso.table.excel.ExcelSheet;
|
||||
import org.enso.table.excel.ExcelWorkbook;
|
||||
import org.enso.table.excel.ReadOnlyExcelConnection;
|
||||
import org.enso.table.problems.ProblemAggregator;
|
||||
import org.graalvm.polyglot.Context;
|
||||
@ -38,18 +36,17 @@ public class ExcelReader {
|
||||
* @return a String[] containing the sheet names.
|
||||
* @throws IOException when the action fails
|
||||
*/
|
||||
public static String[] readSheetNames(File file, ExcelFileFormat format)
|
||||
throws IOException, InvalidFormatException {
|
||||
public static String[] readSheetNames(File file, ExcelFileFormat format) throws IOException {
|
||||
return withWorkbook(file, format, ExcelReader::readSheetNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a list of sheet names from a workbook into an array.
|
||||
*
|
||||
* @param workbook a {@link Workbook} to read the sheet names from.
|
||||
* @param workbook a {@link ExcelWorkbook} to read the sheet names from.
|
||||
* @return a String[] containing the sheet names.
|
||||
*/
|
||||
public static String[] readSheetNames(Workbook workbook) {
|
||||
public static String[] readSheetNames(ExcelWorkbook workbook) {
|
||||
int sheetCount = workbook.getNumberOfSheets();
|
||||
var output = new String[sheetCount];
|
||||
Context context = Context.getCurrent();
|
||||
@ -68,20 +65,8 @@ public class ExcelReader {
|
||||
* @return a String[] containing the range names.
|
||||
* @throws IOException when the action fails
|
||||
*/
|
||||
public static String[] readRangeNames(File file, ExcelFileFormat format)
|
||||
throws IOException, InvalidFormatException {
|
||||
return withWorkbook(file, format, ExcelReader::readRangeNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a list of range names for the specified XLSX/XLS file into an array.
|
||||
*
|
||||
* @param workbook a {@link Workbook} to read the sheet names from.
|
||||
* @return a String[] containing the range names.
|
||||
*/
|
||||
public static String[] readRangeNames(Workbook workbook) {
|
||||
var names = workbook.getAllNames();
|
||||
return names.stream().map(Name::getNameName).toArray(String[]::new);
|
||||
public static String[] readRangeNames(File file, ExcelFileFormat format) throws IOException {
|
||||
return withWorkbook(file, format, ExcelWorkbook::getRangeNames);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -202,7 +187,7 @@ public class ExcelReader {
|
||||
/**
|
||||
* Reads a range by sheet name, named range or address for the workbook into a table.
|
||||
*
|
||||
* @param workbook a {@link Workbook} to read from.
|
||||
* @param workbook a {@link ExcelWorkbook} to read from.
|
||||
* @param rangeNameOrAddress sheet name, range name or address to read.
|
||||
* @param headers specifies whether the first row should be used as headers.
|
||||
* @param skip_rows skip rows from the top of the range.
|
||||
@ -211,7 +196,7 @@ public class ExcelReader {
|
||||
* @throws InvalidLocationException when the range name or address is not found.
|
||||
*/
|
||||
public static Table readRangeByName(
|
||||
Workbook workbook,
|
||||
ExcelWorkbook workbook,
|
||||
String rangeNameOrAddress,
|
||||
ExcelHeaders.HeaderBehavior headers,
|
||||
int skip_rows,
|
||||
@ -230,11 +215,10 @@ public class ExcelReader {
|
||||
problemAggregator);
|
||||
}
|
||||
|
||||
Name name = workbook.getName(rangeNameOrAddress);
|
||||
|
||||
ExcelRange excelRange;
|
||||
try {
|
||||
excelRange = new ExcelRange(name == null ? rangeNameOrAddress : name.getRefersToFormula());
|
||||
var formula = workbook.getNameFormula(rangeNameOrAddress);
|
||||
excelRange = new ExcelRange(formula == null ? rangeNameOrAddress : formula);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new InvalidLocationException(
|
||||
rangeNameOrAddress,
|
||||
@ -271,8 +255,8 @@ public class ExcelReader {
|
||||
readRange(workbook, excelRange, headers, skip_rows, row_limit, problemAggregator));
|
||||
}
|
||||
|
||||
private static <T> T withWorkbook(File file, ExcelFileFormat format, Function<Workbook, T> action)
|
||||
throws IOException {
|
||||
private static <T> T withWorkbook(
|
||||
File file, ExcelFileFormat format, Function<ExcelWorkbook, T> action) throws IOException {
|
||||
try (ReadOnlyExcelConnection connection =
|
||||
ExcelConnectionPool.INSTANCE.openReadOnlyConnection(file, format)) {
|
||||
return connection.withWorkbook(action);
|
||||
@ -280,7 +264,7 @@ public class ExcelReader {
|
||||
}
|
||||
|
||||
public static Table readRange(
|
||||
Workbook workbook,
|
||||
ExcelWorkbook workbook,
|
||||
ExcelRange excelRange,
|
||||
ExcelHeaders.HeaderBehavior headers,
|
||||
int skip_rows,
|
||||
@ -304,7 +288,7 @@ public class ExcelReader {
|
||||
}
|
||||
|
||||
private static Table readTable(
|
||||
Workbook workbook,
|
||||
ExcelWorkbook workbook,
|
||||
int sheetIndex,
|
||||
ExcelRange excelRange,
|
||||
ExcelHeaders.HeaderBehavior headers,
|
||||
@ -312,7 +296,7 @@ public class ExcelReader {
|
||||
int rowCount,
|
||||
ProblemAggregator problemAggregator) {
|
||||
|
||||
ExcelSheet sheet = new ExcelSheet(workbook, sheetIndex);
|
||||
ExcelSheet sheet = workbook.getSheetAt(sheetIndex);
|
||||
|
||||
// Expand Single Cell
|
||||
if (excelRange != null && excelRange.isSingleCell()) {
|
||||
|
@ -77,7 +77,8 @@ public class ExcelWriter {
|
||||
headers =
|
||||
headers != ExcelHeaders.HeaderBehavior.INFER
|
||||
? headers
|
||||
: shouldWriteHeaders(new ExcelSheet(workbook, sheetIndex), firstRow + 1, 1, -1);
|
||||
: shouldWriteHeaders(
|
||||
ExcelSheet.forPOIUserModel(workbook, sheetIndex), firstRow + 1, 1, -1);
|
||||
|
||||
String sheetName = workbook.getSheetName(sheetIndex - 1);
|
||||
workbook.removeSheetAt(sheetIndex - 1);
|
||||
@ -130,7 +131,8 @@ public class ExcelWriter {
|
||||
headers =
|
||||
headers != ExcelHeaders.HeaderBehavior.INFER
|
||||
? headers
|
||||
: shouldWriteHeaders(new ExcelSheet(workbook, sheetIndex), firstRow + 1, 1, -1);
|
||||
: shouldWriteHeaders(
|
||||
ExcelSheet.forPOIUserModel(workbook, sheetIndex), firstRow + 1, 1, -1);
|
||||
|
||||
workbook.removeSheetAt(sheetIndex);
|
||||
Sheet sheet = workbook.createSheet(sheetName);
|
||||
@ -198,7 +200,7 @@ public class ExcelWriter {
|
||||
throw new InvalidLocationException(
|
||||
range.getSheetName(), "Unknown sheet '" + range.getSheetName() + "'.");
|
||||
}
|
||||
ExcelSheet sheet = new ExcelSheet(workbook, sheetIndex);
|
||||
ExcelSheet sheet = ExcelSheet.forPOIUserModel(workbook, sheetIndex);
|
||||
|
||||
if (skipRows != 0) {
|
||||
if (range.isWholeColumn()) {
|
||||
|
Loading…
Reference in New Issue
Block a user