Initial implementation of Data.read_many (#11490)

- Part of #11311
- Adds ability to read a list of files (Vector, Column, Table) into a Vector.
- Reading into a Table of objects or merged will come in a next PR.
This commit is contained in:
Radosław Waśko 2024-11-08 20:03:47 +01:00 committed by GitHub
parent 67db825587
commit e76fe907d3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 392 additions and 3 deletions

View File

@ -49,11 +49,14 @@
programmatically.][11255]
- [DB_Table may be saved as a Data Link.][11371]
- [Support for dates before 1900 in Excel and signed AWS requests.][11373]
- [Added `Data.read_many`, which allows reading a list of files in a single
operation.][11490]
[11235]: https://github.com/enso-org/enso/pull/11235
[11255]: https://github.com/enso-org/enso/pull/11255
[11371]: https://github.com/enso-org/enso/pull/11371
[11373]: https://github.com/enso-org/enso/pull/11373
[11490]: https://github.com/enso-org/enso/pull/11490
#### Enso Language & Runtime

View File

@ -1,5 +1,7 @@
import project.Any.Any
import project.Data.Pair.Pair
import project.Data.Read.Many_Files_List.Many_Files_List
import project.Data.Read.Return_As.Return_As
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Text
import project.Data.Vector.Vector
@ -27,7 +29,7 @@ import project.System.File.Generic.Writable_File.Writable_File
from project.Data.Boolean import Boolean, False, True
from project.Meta.Enso_Project import enso_project
from project.Metadata.Choice import Option
from project.Metadata.Widget import Folder_Browse, Text_Input
from project.Metadata.Widget import Folder_Browse, Text_Input, Vector_Editor
from project.System.File_Format import Auto_Detect, File_Format
## ALIAS load, open
@ -92,6 +94,59 @@ read path=(Missing_Argument.throw "path") format=Auto_Detect (on_problems : Prob
if file_obj.is_directory then Error.throw (Illegal_Argument.Error "Cannot `read` a directory, use `Data.list`.") else
file_obj.read format on_problems
## ALIAS load, open
GROUP Input
ICON data_input
Reads a list of files into Enso.
Arguments:
- paths: A list of files to load. It can be a Vector, Column or Table of
files, paths or URIs to fetch. If a Table is provided, it must either
contain a single column or a column called `path` (case insensitive).
- format: A `File_Format` object used to read files into memory.
If `Auto_Detect` is specified; each file determines the specific
type and configures it appropriately. If there is no matching type then
a `File_Error.Unsupported_Type` error is returned.
- return: Specifies the shape of the data to return.
- on_problems: Specifies the behavior when a problem occurs during the
function.
By default, if one of the files fails to load, a warning is issued and the
entry for that file becomes `Nothing`, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error on the
first failing file.
If set to `Ignore`, the operation proceeds without errors or warnings,
replacing files that fail to load with `Nothing`.
! Request Caching
Responses to HTTP data requests are cached, and additional requests for the
same resources will use the cache, saving a round-trip call to the remote
server. Two resources are considered the same if the URIs and request
headers are the same. Header order does not affect sameness.
The cache respects the "max-age" and "Age" response headers; see
`Data.fetch` for more details.
The cached values are retained as long as the project remains open. Closing
a project will clear the cache.
> Example
Read all CSV files from a directory into a Vector of loaded objects.
from Standard.Table import all
import Standard.Examples
files = Data.list name_filter="*.csv"
example_csv_dir_to_table = Data.read_many files
@paths (Vector_Editor item_editor=Text_Input item_default='""')
@format File_Format.default_widget
read_many : Many_Files_List -> File_Format -> Return_As -> Problem_Behavior -> Any ! File_Error
read_many (paths : Many_Files_List = Missing_Argument.throw "paths") format=Auto_Detect return=..Vector (on_problems : Problem_Behavior = ..Report_Warning) =
    # Turn the (possibly unresolved) `return` setting into a concrete value
    # that knows how to shape the final result.
    result_shape = Return_As.resolve return
    # Every entry is read on its own, using the same format and settings.
    read_one entry = Data.read entry format on_problems
    results = paths.paths_to_load.map on_problems=on_problems read_one
    result_shape.make_return paths results
## ALIAS load text, open text
GROUP Input
ICON data_input

View File

@ -0,0 +1,22 @@
import project.Data.Text.Text
import project.Data.Vector.Vector
## A common interface that represents a list of files that can be read.

   Various types (e.g. Vector, Column) can convert to this type to be able to be
   used in `Data.read_many`.
type Many_Files_List
    ## PRIVATE

       Arguments:
       - original_value: The value that this list was created from. Within
         this type it is only used to build the text representations.
       - paths_to_load: A Vector of the entries that are to be read.
    Value original_value paths_to_load:Vector

    ## PRIVATE
       Text representation, built from the text form of the original value.
    to_text self -> Text =
        "Many_Files_List "+self.original_value.to_text

    ## PRIVATE
       Display text, built from the display text of the original value.
    to_display_text self -> Text =
        "Many_Files_List "+self.original_value.to_display_text
## PRIVATE
   Converts a Vector into a list of files: the Vector itself serves both as the
   original value and as the list of entries to load.
Many_Files_List.from (that : Vector) =
    Many_Files_List.Value original_value=that paths_to_load=that

View File

@ -0,0 +1,78 @@
import project.Any.Any
import project.Data.Text.Text
import project.Data.Read.Many_Files_List.Many_Files_List
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Common.Type_Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Function.Function
import project.Metadata.Display
import project.Metadata.Widget
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, False, True
from project.Metadata.Choice import Option
from project.Metadata.Widget import Single_Choice
polyglot java import org.enso.base.read.ReadManyReturnSPI
## PRIVATE
   Lists the type objects registered through `ReadManyReturnSPI`.
private _get_known_return_classes -> Vector =
    registered_types = ReadManyReturnSPI.get_types False
    Vector.from_polyglot_array registered_types
## A common interface that represents ways to return a list of files that have
   been read.
type Return_As
    ## PRIVATE
       Wraps a concrete return-shape value (e.g. `Return_As_Base.Vector`), to
       which all operations are delegated.
    Instance underlying

    ## PRIVATE
    to_text self -> Text = self.underlying.to_text

    ## PRIVATE
    to_display_text self -> Text = self.underlying.to_display_text

    ## PRIVATE
       Builds the final result by delegating to the underlying value.

       Arguments:
       - input: The list of files that was requested.
       - objects: The objects that were loaded for that list.
    make_return self (input : Many_Files_List) (objects : Vector Any) =
        self.underlying.make_return input objects

    ## PRIVATE
       Resolve an unresolved constructor to the actual type.

       Each type registered through `ReadManyReturnSPI` is asked in turn to
       resolve the value; the first result that is not `Nothing` is returned.
       If no type resolves it, an `Illegal_Argument` error is returned.
    private resolve value = case value of
        _ : Function ->
            types = _get_known_return_classes
            try_next idx =
                if idx >= types.length then Error.throw (Illegal_Argument.Error "Expected Return_As, but got a function.") else
                    resolved = (types.at idx).resolve value
                    if resolved.is_nothing then @Tail_Call try_next (idx + 1) else resolved
            try_next 0
        _ : Return_As -> value
        _ -> Panic.throw (Type_Error.Error Return_As value "Expected `return` to be a Return_As type, but got {got}.")

    ## PRIVATE
       A dropdown offering the options of all registered return types.
    default_widget : Widget
    default_widget =
        # Every registered type returns a Vector of options, so the results are
        # flattened into the single flat Vector of options used by the dropdown
        # (a plain `map` would yield a Vector of Vectors).
        options = _get_known_return_classes.flat_map .get_dropdown_options
        Single_Choice display=Display.Always values=options
## PRIVATE
type Return_As_Base
    ## Will return a Vector of objects that were loaded.
       The order of the returned Vector is the same as in the input.
    Vector

    ## PRIVATE
       The dropdown options for the return shapes defined by this type.
    get_dropdown_options : Vector Option
    get_dropdown_options = [Option "Vector" "..Vector"]

    ## PRIVATE
       Attempts to interpret `value` as a `Return_As_Base`.
       Returns `Nothing` if the type ascription fails with a `Type_Error`.
    resolve value =
        Panic.catch Type_Error (value:Return_As_Base) _->Nothing

    ## PRIVATE
       Returns the loaded objects unchanged; the input list is not used.
    make_return self (input : Many_Files_List) (objects : Vector Any) =
        _ = input
        objects
## PRIVATE
   Wraps a `Return_As_Base` value into the common `Return_As` interface.
Return_As.from (that : Return_As_Base) = Return_As.Instance underlying=that

View File

@ -1,4 +1,5 @@
from Standard.Base import all
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Illegal_State.Illegal_State
@ -2168,3 +2169,8 @@ Vector.from (that:DB_Column) =
## PRIVATE
Cleansable_Text.from (that:DB_Column) = Cleansable_Text.Value (pattern->replace_with-> (that.text_replace (regex pattern) replace_with).rename that.name)
## PRIVATE
   A Database column cannot be used as a list of files; this conversion always
   returns an `Illegal_Argument` error.
Many_Files_List.from (that : DB_Column) =
    _ = that
    message = "`read_many` cannot be used with Database columns. Materialize the column into memory using `.read` first."
    Error.throw (Illegal_Argument.Error message)

View File

@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Base.Data.Array_Proxy.Array_Proxy
import Standard.Base.Data.Filter_Condition as Filter_Condition_Module
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Data.Time.Errors.Date_Time_Format_Parse_Error
import Standard.Base.Data.Vector.Builder
import Standard.Base.Errors.Common.Additional_Warnings
@ -3120,3 +3121,8 @@ make_literal_table connection column_vectors column_names alias =
connection.dialect.make_cast base_column sql_type infer_type_from_database
DB_Table.Value alias connection internal_columns context
## PRIVATE
   A Database table cannot be used as a list of files; this conversion always
   returns an `Illegal_Argument` error.
Many_Files_List.from (that : DB_Table) =
    _ = that
    message = "`read_many` cannot be used with Database tables. Materialize the table into memory using `.read` first."
    Error.throw (Illegal_Argument.Error message)

View File

@ -19,4 +19,3 @@ export project.Extensions.Upload_In_Memory_Table.update_rows
export project.SQL_Query.SQL_Query
export project.Update_Action.Update_Action

View File

@ -1,5 +1,6 @@
from Standard.Base import all
import Standard.Base.Data.Array_Proxy.Array_Proxy
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Data.Vector.No_Wrap
import Standard.Base.Errors.Common.Arithmetic_Error
import Standard.Base.Errors.Common.Incomparable_Values
@ -22,6 +23,7 @@ import project.Internal.Column_Ops
import project.Internal.Date_Time_Helpers
import project.Internal.Java_Problems
import project.Internal.Parse_Values_Helper
import project.Internal.Read_Many_Helpers
import project.Internal.Storage
import project.Internal.Value_Type_Helpers
import project.Internal.Widget_Helpers
@ -2927,3 +2929,8 @@ apply_unary_map column:Column new_name:Text function expected_result_type:Value_
Java_Problems.with_map_operation_problem_aggregator column.name Problem_Behavior.Report_Warning java_problem_aggregator->
map_column = UnaryOperation.mapFunction column.java_column function nothing_unchanged storage_type new_name java_problem_aggregator
Column.Value map_column
## PRIVATE
   Converts a Column into a list of files, after checking that its value type
   is one that may hold files or paths.
Many_Files_List.from (that : Column) =
    Read_Many_Helpers.ensure_column_type_valid_to_be_files_list that <|
        Many_Files_List.Value original_value=that paths_to_load=that.to_vector

View File

@ -0,0 +1,30 @@
private
from Standard.Base import all
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import project.Column.Column
import project.Errors.Invalid_Value_Type
import project.Table.Table
import project.Value_Type.Value_Type
## Finds the column of a Table that lists the files to read and converts it
   into a `Many_Files_List`.

   If the table has exactly one column, that column is used. Otherwise the
   table must contain exactly one column named `path` (case insensitive).

   Arguments:
   - that: The table to search.

   Returns an `Illegal_Argument` error if no suitable column is found or if
   more than one column matches `path`.
find_files_list_in_table (that : Table) -> Many_Files_List =
    found_column = if that.column_count == 1 then that.at 0 else
        path_columns = that.select_columns "path" case_sensitivity=..Insensitive on_problems=..Report_Error
        # A failed selection and an empty selection both mean that there is no `path` column.
        not_found = path_columns.is_error || (path_columns.column_count == 0)
        if not_found then Error.throw (Illegal_Argument.Error "To use a Table as file list, it must be a single column or contain a `path` column (case insensitive).") else
            if path_columns.column_count > 1 then Error.throw (Illegal_Argument.Error "Multiple `path` column candidates found: "+path_columns.column_names.to_display_text+".") else
                path_columns.at 0
    ensure_column_type_valid_to_be_files_list found_column <|
        Many_Files_List.Value that found_column.to_vector
## Runs `action` only if the column has a value type that may hold files or
   paths; otherwise returns an `Invalid_Value_Type` error.

   Arguments:
   - column: The column whose value type is checked.
   - action: The suspended computation to run when the type is accepted.
ensure_column_type_valid_to_be_files_list (column : Column) ~action =
    case column.value_type of
        # Columns containing File objects will be Mixed
        Value_Type.Mixed -> action
        # Columns containing paths as Text will be Char
        Value_Type.Char _ _ -> action
        _ -> Error.throw (Invalid_Value_Type.Column "Text or Mixed" column.value_type column.name)

View File

@ -55,4 +55,3 @@ export project.Table.Table
export project.Value_Type.Auto
export project.Value_Type.Bits
export project.Value_Type.Value_Type

View File

@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Base.Data.Array_Proxy.Array_Proxy
import Standard.Base.Data.Filter_Condition as Filter_Condition_Module
import Standard.Base.Data.Read.Many_Files_List.Many_Files_List
import Standard.Base.Data.Time.Errors.Date_Time_Format_Parse_Error
import Standard.Base.Data.Vector.No_Wrap
import Standard.Base.Errors.Common.Additional_Warnings
@ -47,6 +48,7 @@ import project.Internal.Lookup_Helpers
import project.Internal.Lookup_Helpers.Lookup_Column
import project.Internal.Parse_Values_Helper
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Read_Many_Helpers
import project.Internal.Replace_Helpers
import project.Internal.Split_Tokenize
import project.Internal.Table_Helpers
@ -3881,3 +3883,7 @@ make_fill_nothing_default_widget table cache=Nothing =
## PRIVATE
Helper method for internal use to make a Table from a Java Table.
from_java_table java_table = Table.Value java_table
## PRIVATE
   Converts a Table into a list of files by locating its column of paths.
Many_Files_List.from (that : Table) = Read_Many_Helpers.find_files_list_in_table that

View File

@ -0,0 +1,14 @@
package org.enso.base.read;
/** Registers the {@code Return_As_Base} type of the Base library as a {@link ReadManyReturnSPI}. */
@org.openide.util.lookup.ServiceProvider(service = ReadManyReturnSPI.class)
public class BaseReadManyReturnSPI extends ReadManyReturnSPI {
  private static final String MODULE_NAME = "Standard.Base.Data.Read.Return_As";
  private static final String TYPE_NAME = "Return_As_Base";

  @Override
  protected String getModuleName() {
    return MODULE_NAME;
  }

  @Override
  protected String getTypeName() {
    return TYPE_NAME;
  }
}

View File

@ -0,0 +1,25 @@
package org.enso.base.read;
import java.util.ServiceLoader;
import org.enso.base.polyglot.EnsoMeta;
import org.graalvm.polyglot.Value;
/**
 * Service provider interface for types that describe how the results of reading many files are
 * returned. Implementations name an Enso module and a type within it.
 */
public abstract class ReadManyReturnSPI {
  private static final ServiceLoader<ReadManyReturnSPI> loader =
      ServiceLoader.load(ReadManyReturnSPI.class, ReadManyReturnSPI.class.getClassLoader());

  /**
   * Returns the type objects of all registered providers.
   *
   * @param refresh if true, the service loader is reloaded before the providers are listed
   * @return one type object per registered provider
   */
  public static Value[] get_types(boolean refresh) {
    if (refresh) {
      loader.reload();
    }

    return loader.stream()
        .map(ServiceLoader.Provider::get)
        .map(ReadManyReturnSPI::getTypeObject)
        .toArray(Value[]::new);
  }

  /** Looks up the type named by {@link #getModuleName()} and {@link #getTypeName()}. */
  public Value getTypeObject() {
    String moduleName = getModuleName();
    String typeName = getTypeName();
    return EnsoMeta.getType(moduleName, typeName);
  }

  /** The qualified name of the Enso module that defines the type. */
  protected abstract String getModuleName();

  /** The name of the type within that module. */
  protected abstract String getTypeName();
}

View File

@ -200,6 +200,12 @@ add_specs suite_builder =
r = Data.read (URI.from url_get)
r.should_be_a JS_Object
group_builder.specify "can use URI or Text URLs in Data.read_many" <|
r = Data.read_many [URI.from url_get, url_get]
r.should_be_a Vector
r.at 0 . should_be_a JS_Object
r.at 1 . should_be_a JS_Object
group_builder.specify "works if HTTP is uppercase" <| Test.with_retries <|
r = Data.fetch (url_get.replace "http" "HTTP")
r.should_be_a JS_Object

View File

@ -1,9 +1,12 @@
from Standard.Base import all
import Standard.Base.Data.Vector.Map_Error
import Standard.Base.Errors.Encoding_Error.Encoding_Error
import Standard.Base.Errors.File_Error.File_Error
from Standard.Test import all
## A holder for a value that is computed on demand: the `get` field is
   declared as suspended (`~`).
type Lazy_Ref
    Value ~get
add_specs suite_builder =
sample_xxx = enso_project.data / "sample.xxx"
@ -85,6 +88,48 @@ add_specs suite_builder =
patterns.should_contain "*.txt"
patterns.should_contain "*.json"
suite_builder.group "Data.read_many" group_builder->
js_object = JS_Object.from_pairs [["arr", [1, 2, 3]], ["num", 42.5], ["not", Nothing]]
js_as_text = Lazy_Ref.Value <|
(enso_project.data / "sample.json") . read ..Plain_Text
group_builder.specify "should allow to read a list of files and return them as vector" <|
files = [enso_project.data / "sample.json", enso_project.data / "helloworld.txt"]
# Read all files using Auto_Detect - each file is read according to its inferred format.
r1 = Data.read_many files
r1.should_equal [js_object, "Hello World!"]
# Read all files using a specified format.
r2 = Data.read_many files format=..Plain_Text
r2.should_be_a Vector
r2.should_equal [js_as_text.get, "Hello World!"]
group_builder.specify "should work with paths as Text" <|
files = [enso_project.data / "sample.json", enso_project.data / "helloworld.txt"]
paths = files.map .path
r1 = Data.read_many paths return=..Vector
r1.should_equal [js_object, "Hello World!"]
three_files = [enso_project.data / "sample.json", enso_project.data / "nonexistent.txt", enso_project.data / "helloworld.txt"]
group_builder.specify "should allow to Report_Error if any file fails to load" <|
r1 = Data.read_many three_files return=..Vector on_problems=..Report_Error
# The error reports as File_Error
r1.should_fail_with File_Error
# But it's actually Map_Error with index metadata
r1.should_fail_with unwrap_errors=False Map_Error
r1.catch.index . should_equal 1
r1.catch.inner_error.should_be_a File_Error.Not_Found
group_builder.specify "should allow to Ignore errors if any file fails to load" <|
r1 = Data.read_many three_files return=..Vector on_problems=..Ignore
r1.should_equal [js_object, Nothing, "Hello World!"]
Problems.assume_no_problems r1
group_builder.specify "should allow to continue loading if errors are encountered, but report them as warnings" <|
r1 = Data.read_many three_files return=..Vector on_problems=..Report_Warning
r1.should_equal [js_object, Nothing, "Hello World!"]
Problems.expect_only_warning File_Error r1
main filter=Nothing =
suite = Test.build suite_builder->
add_specs suite_builder

View File

@ -11,6 +11,7 @@ import project.IO.Excel_Spec
import project.IO.Fetch_Spec
import project.IO.Formats_Spec
import project.IO.Json_Spec
import project.IO.Read_Many_Spec
add_specs suite_builder =
Cloud_Spec.add_specs suite_builder
@ -22,6 +23,7 @@ add_specs suite_builder =
Data_Link_Formats_Spec.add_specs suite_builder
Fetch_Spec.add_specs suite_builder
Json_Spec.add_specs suite_builder
Read_Many_Spec.add_specs suite_builder
main filter=Nothing =
suite = Test.build suite_builder->

View File

@ -0,0 +1,86 @@
from Standard.Base import all
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from Standard.Table import all
from Standard.Table.Errors import Invalid_Value_Type
from Standard.Database import all
from Standard.Test import all
from project.Util import all
## Builds the test suite of this file and runs it, optionally restricted by
   `filter`.
main filter=Nothing =
    built_suite = Test.build (builder-> add_specs builder)
    built_suite.run_with_filter filter
## A holder for a value that is computed on demand: the `get` field is
   declared as suspended (`~`).
type Lazy_Ref
    Value ~get
## Registers the `Data.read_many` tests that use Table, Column and Database
   inputs.
add_specs suite_builder =
    suite_builder.group "Data.read_many" group_builder->
        # One File and one Text path
        files_vector = [enso_project.data / "empty.txt", (enso_project.data / "sample.tsv") . path]
        # The expected table is wrapped in a Lazy_Ref, so it is read on demand.
        sample_table = Lazy_Ref.Value <|
            (enso_project.data / "sample.tsv") . read

        # Shared assertion: the result holds the contents of both files, in input order.
        check_loaded_vector v =
            v.should_be_a Vector
            v.length . should_equal 2
            v.at 0 . should_equal ""
            v.at 1 . should_equal sample_table.get

        group_builder.specify "should read files listed in a Column" <|
            column = Column.from_vector "Col" files_vector
            ## TODO for next PR:
               test that if `return` is not specified, it will return as a Table when a Column is provided
            r1 = Data.read_many column return=..Vector
            check_loaded_vector r1
            Problems.assume_no_problems r1

        group_builder.specify "should read files listed in a single column Table" <|
            table1 = Table.new [["Some column", files_vector]]
            r1 = Data.read_many table1 return=..Vector
            # TODO like above
            check_loaded_vector r1

        group_builder.specify "should read files listed in a Table with `path` column" <|
            table2 = Table.new [["X", [1, 2]], ["path", files_vector]]
            r2 = Data.read_many table2 return=..Vector
            # TODO like above
            check_loaded_vector r2
            Problems.assume_no_problems r2

            # Test that this is really case insensitive
            table3 = Table.new [["X", [1, 2]], ["pAtH", files_vector]]
            r3 = Data.read_many table3 return=..Vector
            check_loaded_vector r3
            Problems.assume_no_problems r3

        group_builder.specify "will fail if no `path` column can be found or its ambiguous" <|
            # Two columns, neither of them named `path`.
            table1 = Table.new [["X", [1, 2]], ["Y", files_vector]]
            r1 = Data.read_many table1 return=..Vector
            r1.should_fail_with Illegal_Argument

            # Two columns whose names differ only by case both match `path`.
            table2 = Table.new [["X", [1, 2]], ["path", files_vector], ["Path", [3, 4]]]
            r2 = Data.read_many table2 return=..Vector
            r2.should_fail_with Illegal_Argument

        group_builder.specify "fails if a DB Table or Column is provided, telling to materialize first to in-memory" <|
            connection = Database.connect SQLite.In_Memory
            # Convert every entry to a Text path before uploading to the Database.
            paths_vector = files_vector.map x-> case x of
                f : File -> f.path
                p : Text -> p
            table = (Table.new [["path", paths_vector]]).select_into_database_table connection "test_table" temporary=True

            r = Data.read_many table return=..Vector
            r.should_fail_with Illegal_Argument

            col = table.at "path"
            r2 = Data.read_many col return=..Vector
            r2.should_fail_with Illegal_Argument

        group_builder.specify "fails if a column of invalid type is provided" <|
            # Both columns hold integers, so neither is accepted as a list of files.
            table = Table.new [["path", [1, 2]], ["X", [33, 44]]]
            Data.read_many table . should_fail_with Invalid_Value_Type
            Data.read_many (table.at "path") . should_fail_with Invalid_Value_Type
            Data.read_many (table.select_columns ["X"]) . should_fail_with Invalid_Value_Type