Improve Non_Unique_Primary_Key error, split file format detection into read/write, improve SQLite format detection (#6604)

Closes #6437
Related to #6410

- Add example duplicate row to `Non_Unique_Primary_Key`.
- Ensure `File.read` fails if the file does not exist, always.
- Ensure SQLite fails if file is empty or nonexistent or malformed.
- Split file format detection into read and write modes, so that the read mode can depend on actual file _contents_.
This commit is contained in:
Radosław Waśko 2023-05-09 19:15:44 +02:00 committed by GitHub
parent d6f7cea923
commit d8b926922a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 211 additions and 36 deletions

View File

@ -438,6 +438,8 @@ type Array
flatten : Vector Any
flatten self = Vector.flatten self
## PRIVATE
ADVANCED
short_display_text : Integer -> Text
short_display_text self max_entries=10 = Vector.short_display_text self max_entries
@ -641,9 +643,15 @@ type Array
join : Text -> Text -> Text -> Text
join self separator="" prefix="" suffix="" = Vector.join self separator prefix suffix
## PRIVATE
Generates a human-readable text representation of the array.
to_text : Text
to_text self = self.map .to_text . join ", " "[" "]"
## PRIVATE
to_display_text : Text
to_display_text self = self.short_display_text max_entries=40
## Combines all the elements of a non-empty array using a binary operation.
If the array is empty, it returns `if_empty`.

View File

@ -618,6 +618,10 @@ type Vector a
to_text : Text
to_text self = self.map .to_text . join ", " "[" "]"
## PRIVATE
to_display_text : Text
to_display_text self = self.short_display_text max_entries=40
## PRIVATE
ADVANCED

View File

@ -245,7 +245,8 @@ type File
@format format_widget
read : File_Format -> Problem_Behavior -> Any ! File_Error
read self format=Auto_Detect (on_problems=Problem_Behavior.Report_Warning) =
format.read self on_problems
if self.exists.not then Error.throw (File_Error.Not_Found self) else
format.read self on_problems
## ALIAS Load Bytes, Open Bytes
Reads all bytes in this file into a byte vector.
@ -612,6 +613,14 @@ type File
resource = Managed_Resource.register stream close_stream
Input_Stream.Value self resource
## PRIVATE
   Returns the leading `n` bytes of this file as a vector of bytes. If the
   file holds fewer than `n` bytes, the returned vector is shorter.
read_first_bytes : Integer -> Vector ! File_Error
read_first_bytes self n =
    self.with_input_stream [File_Access.Read] stream-> stream.read_n_bytes n
## PRIVATE
Reads last `n` bytes from the file (or less if the file is too small) and
returns a vector of bytes.

View File

@ -58,14 +58,26 @@ type Auto_Detect
Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any ! File_Error
read self file on_problems =
reader = Auto_Detect.get_format file
reader = Auto_Detect.get_reading_format file
if reader == Nothing then Error.throw (File_Error.Unsupported_Type file) else
reader.read file on_problems
## PRIVATE
get_format : File -> Any | Nothing
get_format file =
get_format f-> f.for_file file
Finds a matching format for reading the file.
It assumes that `file` already exists.
get_reading_format : File -> Any | Nothing
get_reading_format file =
get_format f-> f.for_file_read file
## PRIVATE
Finds a matching format for writing the file.
It must not assume that the `file` exists, so it may rely only on the
file path (the extension in particular), and not on the contents.
get_writing_format : File -> Any | Nothing
get_writing_format file =
get_format f-> f.for_file_write file
## PRIVATE
get_web_parser : Text -> URI -> Any | Nothing
@ -91,13 +103,18 @@ type Plain_Text_Format
## PRIVATE
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> Plain_Text_Format | Nothing
for_file file =
for_file_read : File -> Plain_Text_Format | Nothing
for_file_read file =
case file.extension of
".txt" -> Plain_Text_Format.Plain_Text
".log" -> Plain_Text_Format.Plain_Text
_ -> Nothing
## PRIVATE
If this File_Format should be used for writing to that file, return a configured instance.
for_file_write : File -> Plain_Text_Format | Nothing
for_file_write file = Plain_Text_Format.for_file_read file
## PRIVATE
If the File_Format supports reading from the web response, return a configured instance.
for_web : Text -> URI -> Plain_Text_Format | Nothing
@ -127,12 +144,17 @@ type Plain_Text_Format
type Bytes
## PRIVATE
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> Bytes | Nothing
for_file file =
for_file_read : File -> Bytes | Nothing
for_file_read file =
case file.extension of
".dat" -> Bytes
_ -> Nothing
## PRIVATE
If this File_Format should be used for writing to that file, return a configured instance.
for_file_write : File -> Bytes | Nothing
for_file_write file = Bytes.for_file_read file
## PRIVATE
If the File_Format supports reading from the web response, return a configured instance.
As `Bytes` does not support reading from the web, this returns `Nothing`.
@ -148,13 +170,18 @@ type Bytes
type JSON_Format
## PRIVATE
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> JSON_Format | Nothing
for_file file =
for_file_read : File -> JSON_Format | Nothing
for_file_read file =
case file.extension of
".json" -> JSON_Format
".geojson" -> JSON_Format
_ -> Nothing
## PRIVATE
If this File_Format should be used for writing to that file, return a configured instance.
for_file_write : File -> JSON_Format | Nothing
for_file_write file = JSON_Format.for_file_read file
## PRIVATE
If the File_Format supports reading from the web response, return a configured instance.
for_web : Text -> URI -> JSON_Format | Nothing

View File

@ -11,8 +11,16 @@ type SQLite_Format
## PRIVATE
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> SQLite_Format | Nothing
for_file file =
for_file_read : File -> SQLite_Format | Nothing
for_file_read file =
    # Detection is content-based: the leading bytes of the file are compared
    # with the SQLite magic header; the file extension is not consulted.
    expected = magic_header_string
    actual = file.read_first_bytes expected.length
    if actual == expected then SQLite_Format.For_File else
        Nothing
## PRIVATE
If the File_Format supports writing to the file, return a configured instance.
for_file_write : File -> SQLite_Format | Nothing
for_file_write file =
case file.extension of
".db" -> SQLite_Format.For_File
".sqlite" -> SQLite_Format.For_File
@ -31,3 +39,8 @@ type SQLite_Format
read self file on_problems =
_ = [on_problems]
Database.connect (SQLite_Details.SQLite file)
## PRIVATE
The header bytes expected at the very start of a SQLite database file: the
UTF-8 bytes of the text "SQLite format 3" followed by a single zero byte
(16 bytes in total).
Based on the File Format definition at: https://www.sqlite.org/fileformat.html
magic_header_string =
"SQLite format 3".utf_8 + [0]

View File

@ -155,10 +155,14 @@ type Non_Unique_Primary_Key
Arguments:
- primary_key: The primary key that is not unique.
Error (primary_key : Vector Text)
- clashing_primary_key: The values of an example key that corresponds to
more than one row.
- clashing_example_row_count: The number of rows that correspond to the
example key.
Error (primary_key : Vector Text) (clashing_primary_key : Vector Any) (clashing_example_row_count : Integer)
## PRIVATE
Pretty print the non-unique primary key error.
to_display_text : Text
to_display_text self =
"The primary key " + self.primary_key.to_display_text + " is not unique."
"The primary key " + self.primary_key.to_display_text + " is not unique. The key "+self.clashing_primary_key.to_display_text+" corresponds to "+self.clashing_example_row_count.to_text+" rows."

View File

@ -62,7 +62,7 @@ In_Memory_Table.create_database_table self connection table_name=Nothing primary
continue. Otherwise, they could 'leak' to `Panic.rethrow` and be wrongly
raised as panics.
upload_status = create_table_statement.if_not_error <|
translate_known_upload_errors connection resolved_primary_key <|
translate_known_upload_errors self connection resolved_primary_key <|
connection.jdbc_connection.run_within_transaction <|
Panic.rethrow <| connection.execute_update create_table_statement
if structure_only.not then
@ -119,7 +119,7 @@ Database_Table.create_database_table self connection table_name=Nothing primary_
Error.throw (Unsupported_Database_Operation.Error "The Database table to be uploaded must be coming from the same connection as the connection on which the new table is being created. Cross-connection uploads are currently not supported. To work around this, you can first `.read` the table into memory and then upload it from memory to a different connection.")
upload_status = connection_check.if_not_error <| create_table_statement.if_not_error <|
translate_known_upload_errors connection resolved_primary_key <|
translate_known_upload_errors self connection resolved_primary_key <|
connection.jdbc_connection.run_within_transaction <|
Panic.rethrow <| connection.execute_update create_table_statement
if structure_only.not then
@ -144,15 +144,35 @@ resolve_primary_key table primary_key = case primary_key of
## PRIVATE
Inspects any `SQL_Error` thrown and replaces it with a more precise error
type when available.
translate_known_upload_errors connection primary_key ~action =
translate_known_upload_errors source_table connection primary_key ~action =
handler caught_panic =
error_mapper = connection.dialect.get_error_mapper
sql_error = caught_panic.payload
case error_mapper.is_primary_key_violation sql_error of
True -> Error.throw (Non_Unique_Primary_Key.Error primary_key)
True -> raise_duplicated_primary_key_error source_table primary_key caught_panic
False -> Panic.throw caught_panic
Panic.catch SQL_Error action handler
## PRIVATE
   Builds and raises a `Non_Unique_Primary_Key` error that carries one example
   key violating the uniqueness constraint, together with the number of rows
   sharing that key.

   Arguments:
   - source_table: the table being uploaded; it is aggregated to find a
     duplicated key.
   - primary_key: the columns forming the primary key; the rows are grouped
     by them.
   - original_panic: the caught panic; it is rethrown if no duplicated key
     can be found.
raise_duplicated_primary_key_error source_table primary_key original_panic =
    grouping = primary_key.map Aggregate_Column.Group_By
    # The count is the first aggregate, so it ends up in column 0.
    counted = source_table.aggregate [Aggregate_Column.Count]+grouping
    duplicated = counted.filter column=0 (Filter_Condition.Greater than=1)
    example = duplicated.read max_rows=1
    # If we couldn't find a duplicated key, we give up the translation and
    # rethrow the original panic containing the SQL error. This could happen
    # if the constraint violation is on some non-trivial key, like case
    # insensitive.
    if example.row_count == 0 then Panic.throw original_panic else
        values = example.first_row.to_vector
        clashing_count = values.first
        clashing_key = values.drop 1
        Error.throw (Non_Unique_Primary_Key.Error primary_key clashing_key clashing_count)
## PRIVATE
Creates a statement that will create a table with structure determined by the
provided columns.

View File

@ -14,11 +14,16 @@ type Image_File_Format
## PRIVATE
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> Image_File_Format | Nothing
for_file file =
for_file_read : File -> Image_File_Format | Nothing
for_file_read file =
extension = file.extension
if supported.contains extension then Image_File_Format.For_File else Nothing
## PRIVATE
If this File_Format should be used for writing to that file, return a configured instance.
for_file_write : File -> Image_File_Format | Nothing
for_file_write file = Image_File_Format.for_file_read file
## PRIVATE
If the File_Format supports reading from the web response, return a configured instance.
for_web : Text -> URI -> Image_File_Format | Nothing

View File

@ -1660,6 +1660,21 @@ type Table
row_count : Integer
row_count self = self.java_table.rowCount
## Returns a materialized dataframe containing rows of this table.
In the in-memory backend, this returns the same table, truncated to
`max_rows`. This is only kept for API compatibility between database and
in-memory tables. The `read` operation can be used to ensure that the
table is now in-memory, regardless of its origin.
Arguments:
- max_rows: specifies the maximum number of rows to fetch; if `Nothing`
(the default), this table is returned unchanged, otherwise only the
first `max_rows` rows are kept.
read : (Integer | Nothing) -> Table
read self max_rows=Nothing = case max_rows of
Nothing -> self
_ : Integer -> self.take (First max_rows)
## Returns a Table describing this table's contents.
The table lists all columns, counts of non-null items and value types of
@ -1913,7 +1928,7 @@ type Table
file = File.new path
case format of
_ : Auto_Detect ->
base_format = format.get_format file
base_format = format.get_writing_format file
if base_format == Nothing then Error.throw (File_Error.Unsupported_Output_Type file Table) else
self.write file format=base_format on_existing_file match_columns on_problems
_ ->

View File

@ -54,14 +54,19 @@ type Delimited_Format
## PRIVATE
ADVANCED
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> Delimited_Format | Nothing
for_file file =
for_file_read : File -> Delimited_Format | Nothing
for_file_read file =
case file.extension of
".csv" -> Delimited_Format.Delimited ','
".tab" -> Delimited_Format.Delimited '\t'
".tsv" -> Delimited_Format.Delimited '\t'
_ -> Nothing
## PRIVATE
If this File_Format should be used for writing to that file, return a configured instance.
for_file_write : File -> Delimited_Format | Nothing
for_file_write file = Delimited_Format.for_file_read file
## PRIVATE
ADVANCED
If the File_Format supports reading from the web response, return a configured instance.

View File

@ -12,6 +12,13 @@ polyglot java import org.enso.table.error.EmptySheetException
type Missing_Input_Columns
## PRIVATE
One or more columns not found in the input table.
Arguments:
- criteria: the names of the columns or regular expressions that did not
have any matches.
- where: an optional text describing which object this error relates to
(for example, in a join, whether the reported error is for the left or
the right table).
Error (criteria : [Text]) (where:Text|Nothing = Nothing)
## PRIVATE

View File

@ -48,12 +48,17 @@ type Excel_Format
## PRIVATE
ADVANCED
If the File_Format supports reading from the file, return a configured instance.
for_file : File -> Excel_Format | Nothing
for_file file =
for_file_read : File -> Excel_Format | Nothing
for_file_read file =
is_xls = should_treat_as_xls_format Infer file
if is_xls.is_error then Nothing else
Excel_Format.Excel xls_format=is_xls
## PRIVATE
If this File_Format should be used for writing to that file, return a configured instance.
for_file_write : File -> Excel_Format | Nothing
for_file_write file = Excel_Format.for_file_read file
## PRIVATE
ADVANCED
If the File_Format supports reading from the web response, return a configured instance.

View File

@ -284,7 +284,8 @@ Error.should_succeed self frames_to_skip=0 =
## Handles an unexpected dataflow error.
Arguments:
- typ: the type the value was expected to have; it is ignored here, because
the receiver is a dataflow error and the check always fails.
- frames_to_skip (optional, by default 0): forwarded, increased by one, to
`Test.fail_match_on_unexpected_error`.
Error.should_be_a : Any -> Integer -> Any
Error.should_be_a self frames_to_skip=0 =
Error.should_be_a self typ frames_to_skip=0 =
_ = typ
Test.fail_match_on_unexpected_error self 1+frames_to_skip
## Asserts that the given `Boolean` is `True`

View File

@ -69,9 +69,9 @@ spec =
Test.group "Image File_Format" <|
Test.specify "should recognise image files" <|
Auto_Detect.get_format (enso_project.data / "data.jpg") . should_be_a Image_File_Format
Auto_Detect.get_format (enso_project.data / "data.png") . should_be_a Image_File_Format
Auto_Detect.get_format (enso_project.data / "data.bmp") . should_be_a Image_File_Format
Auto_Detect.get_reading_format (enso_project.data / "data.jpg") . should_be_a Image_File_Format
Auto_Detect.get_reading_format (enso_project.data / "data.png") . should_be_a Image_File_Format
Auto_Detect.get_reading_format (enso_project.data / "data.bmp") . should_be_a Image_File_Format
Test.specify "should allow reading an Image" <|
img = Data.read rgba_file

View File

@ -1,6 +1,7 @@
from Standard.Base import all
import Standard.Base.Runtime.Ref.Ref
import Standard.Base.Runtime.Context
import Standard.Base.Errors.File_Error.File_Error
import Standard.Table.Data.Type.Value_Type.Bits
from Standard.Table import Table, Value_Type
@ -154,7 +155,7 @@ sqlite_spec connection prefix =
spec =
enso_project.data.create_directory
file = enso_project.data / "sqlite_test.db"
file = enso_project.data / "transient" / "sqlite_test.db"
file.delete_if_exists
in_file_prefix = "[SQLite File] "
sqlite_spec (Database.connect (SQLite file)) in_file_prefix
@ -174,11 +175,31 @@ spec =
connection.execute_update 'CREATE TABLE "Dummy" ("strs" VARCHAR, "ints" INTEGER, "bools" BOOLEAN, "reals" REAL)'
connection.close
Test.specify "should recognise a db file" <|
Auto_Detect.get_format (enso_project.data / "data.db") . should_be_a SQLite_Format
Test.specify "should recognise a SQLite database file" <|
Auto_Detect.get_reading_format file . should_be_a SQLite_Format
Test.specify "should recognise a sqlite file" <|
Auto_Detect.get_format (enso_project.data / "data.sqlite") . should_be_a SQLite_Format
Test.specify "should recognise a sqlite file by extension for writing" <|
Auto_Detect.get_writing_format (enso_project.data / "nonexistent-data.db") . should_be_a SQLite_Format
Auto_Detect.get_writing_format (enso_project.data / "nonexistent-data.sqlite") . should_be_a SQLite_Format
Test.specify "should not recognise nonexistent or empty files for reading" <|
    # A missing file is reported as `Not_Found`.
    r1 = Data.read (enso_project.data / "nonexistent-data.db")
    r1.should_fail_with File_Error
    r1.catch . should_be_a File_Error.Not_Found

    # An empty file has no SQLite header, so no reading format matches it.
    empty = enso_project.data / "transient" / "empty-data.db"
    "".write empty on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
    r2 = Data.read empty
    r2.should_fail_with File_Error
    r2.catch . should_be_a File_Error.Unsupported_Type
    empty.delete_if_exists

    # A file with a matching extension but a wrong header must not match
    # either. It uses its own path and variable, so this case does not depend
    # on the (already deleted) `empty` file above.
    broken = enso_project.data / "transient" / "broken-data.db"
    "SOME_RANDOM_DATA".write broken on_existing_file=Existing_File_Behavior.Overwrite . should_succeed
    r3 = Data.read broken
    r3.should_fail_with File_Error
    r3.catch . should_be_a File_Error.Unsupported_Type
    broken.delete_if_exists
Test.specify "should connect to a db file" <|
connection = Data.read file

View File

@ -104,14 +104,22 @@ spec make_new_connection prefix persistent_connector=True =
t1 = Table.new [["X", [1, 2, 1]], ["Y", ['b', 'b', 'a']]]
r1 = t1.create_database_table connection (Name_Generator.random_name "primary-key-6") temporary=True primary_key=["X"]
r1.should_fail_with Non_Unique_Primary_Key
e1 = r1.catch
e1.clashing_primary_key . should_equal [1]
e1.clashing_example_row_count . should_equal 2
e1.to_display_text . should_equal "The primary key [X] is not unique. The key [1] corresponds to 2 rows."
r2 = t1.create_database_table connection (Name_Generator.random_name "primary-key-6") temporary=True primary_key=["Y"]
r2.should_fail_with Non_Unique_Primary_Key
r2.catch . clashing_primary_key . should_equal ['b']
r3 = t1.create_database_table connection (Name_Generator.random_name "primary-key-7") temporary=True primary_key=["X", "Y"]
r3.at "X" . to_vector . should_equal [1, 2, 1]
t2 = Table.new [["X", [1, 2, 1]], ["Y", ['a', 'b', 'a']]]
r4 = t2.create_database_table connection (Name_Generator.random_name "primary-key-7") temporary=True primary_key=["X", "Y"]
r4.should_fail_with Non_Unique_Primary_Key
r4.catch . clashing_primary_key . should_equal [1, 'a']
Test.group prefix+"Persisting a Database Table (query)" <|
Test.specify "should be able to create a persistent copy of a DB table" <|
@ -193,6 +201,22 @@ spec make_new_connection prefix persistent_connector=True =
r1 = db_table.create_database_table connection (Name_Generator.random_name "copied-table") temporary=True primary_key=["X"]
r1.should_fail_with Non_Unique_Primary_Key
e1 = r1.catch
e1.clashing_primary_key . should_equal [1]
e1.clashing_example_row_count . should_equal 2
t2 = Table.new [["X", [1, 3, 1, 2, 3, 2, 2, 2, 0]], ["Y", ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']]]
db_table_2 = t2.create_database_table connection (Name_Generator.random_name "source-table-2") temporary=True primary_key=Nothing
Problems.assume_no_problems db_table_2
r2 = db_table_2.create_database_table connection (Name_Generator.random_name "copied-table-2") temporary=True primary_key=["X"]
r2.should_fail_with Non_Unique_Primary_Key
e2 = r2.catch
e2.clashing_primary_key.length . should_equal 1
x = e2.clashing_primary_key.first
[1, 2, 3].should_contain x
counts = Map.from_vector [[1, 2], [2, 4], [3, 2]]
e2.clashing_example_row_count . should_equal (counts.at x)
Test.specify "will not allow to upload tables across connections" <|
t = Table.new [["X", [1, 2, 1]], ["Y", ['b', 'b', 'a']]]

View File

@ -346,6 +346,8 @@ type_spec name alter = Test.group name <|
alter [1, 2, 3, 4, 5, 6] . short_display_text max_entries=3 . should_equal "[1, 2, 3 and 3 more elements]"
alter (0.up_to 100).to_vector . short_display_text max_entries=2 . should_equal "[0, 1 and 98 more elements]"
alter [1, 2] . to_display_text . should_equal "[1, 2]"
alter [] . short_display_text max_entries=0 . should_fail_with Illegal_Argument
Test.specify "should define equality" <|

View File

@ -25,7 +25,7 @@ spec =
list = ArrayList.new
list.add 432
list.get 0 . should_equal 432
Test.specify "should report missing method error on Java Arrays" <|
Test.specify "should report missing method error on Java Arrays" pending="Failing due to #6609" <|
list = ArrayList.new
list.add 432
Test.expect_panic_with (list.asList) No_Such_Method

View File

@ -20,6 +20,11 @@ spec =
content = sample_txt.read
content.should_equal "Hello World!"
Test.specify "should raise a not-found error when reading a nonexistent file even of unknown format" <|
r1 = (File.new "nonexistent.file.of.weird-format").read
r1.should_fail_with File_Error
r1.catch.should_be_a File_Error.Not_Found
Test.group "Bytes" <|
Test.specify "should be able to read a file as Bytes" <|
bytes = sample_xxx.read Bytes