Give file read its own helper widget for delimiters. (#8627)

Give file read its own helper widget for delimiters. Remove the newline option and add a none option. The file read delimiter is similar to, but different from, the split one, so it should have its own set of options.
This commit is contained in:
AdRiley 2024-01-04 11:59:42 +00:00 committed by GitHub
parent 20531d51df
commit bf8dd1888c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 33 additions and 12 deletions

View File

@ -599,6 +599,8 @@
- [Support for loading big Excel files.][8403]
- [Added new `Filter_Condition`s - `Equal_Ignore_Case`, `Is_Nan`, `Is_Infinite`
and `Is_Finite`.][8539]
- [Added text_length to Column][8606]
- [Added none delimiter option for Data.Read][8627]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -858,6 +860,8 @@
[8403]: https://github.com/enso-org/enso/pull/8403
[8539]: https://github.com/enso-org/enso/pull/8539
[8564]: https://github.com/enso-org/enso/pull/8564
[8606]: https://github.com/enso-org/enso/pull/8606
[8627]: https://github.com/enso-org/enso/pull/8627
#### Enso Compiler
@ -1011,7 +1015,6 @@
- [Upgrade GraalVM to 23.1.0 JDK21][7991]
- [Added opt-in type checks of return type][8502]
- [DataflowError.withoutTrace doesn't store stacktrace][8608]
- [Added text_length to Column][8606]
[3227]: https://github.com/enso-org/enso/pull/3227
[3248]: https://github.com/enso-org/enso/pull/3248
@ -1164,7 +1167,6 @@
[7991]: https://github.com/enso-org/enso/pull/7991
[8502]: https://github.com/enso-org/enso/pull/8502
[8608]: https://github.com/enso-org/enso/pull/8608
[8606]: https://github.com/enso-org/enso/pull/8606
# Enso 2.0.0-alpha.18 (2021-10-12)

View File

@ -19,6 +19,12 @@ make_delimiter_selector : Widget
make_delimiter_selector =
make_single_choice [',', ';', '|', ['{tab}', "'\t'"], ['{space}', "' '"], ['{newline}', "['\n', '\r\n', '\r']"], '_', ['Custom', "'?'"]]
## PRIVATE
Creates a Single_Choice Widget for file read delimiters.
make_file_read_delimiter_selector : Widget
make_file_read_delimiter_selector =
make_single_choice [',', ';', '|', ['{tab}', "'\t'"], ['{space}', "' '"], ['{none}', "''"], '_', ['Custom', "'?'"]]
## PRIVATE
Creates a Single_Choice Widget for parsing dates.
make_date_format_selector : Date -> Widget

View File

@ -2,7 +2,7 @@ from Standard.Base import all
import Standard.Base.Network.HTTP.Response.Response
import Standard.Base.System.File_Format_Metadata.File_Format_Metadata
import Standard.Base.System.Input_Stream.Input_Stream
from Standard.Base.Widget_Helpers import make_delimiter_selector
from Standard.Base.Widget_Helpers import make_file_read_delimiter_selector
import project.Data.Data_Formatter.Data_Formatter
import project.Data.Match_Columns.Match_Columns
@ -52,7 +52,7 @@ type Delimited_Format
character if it is anywhere other than at the beginning of the line. This
option is only applicable for read mode and does not affect writing. It
defaults to `Nothing` which means that comments are disabled.
@delimiter make_delimiter_selector
@delimiter make_file_read_delimiter_selector
@encoding Encoding.default_widget
Delimited (delimiter:Text=',') (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter.Value) (keep_invalid_rows:Boolean=True) (line_endings:Line_Ending_Style|Infer=Infer) (comment_character:Text|Nothing=Nothing)

View File

@ -42,7 +42,9 @@ public class DelimitedReader {
* <p>I considered choosing `\uF8EE`, which comes from the Private Use Area of the Basic
* Multilingual Plane. It has no meaning designated by the Unicode standard.
*/
public static final char UNUSED_CHARACTER = '\0';
public static final char COMMENT_CHARACTER = '\0';
public static final char UNUSED_CHARACTER = '\uF8EE';
private static final String COLUMN_NAME = "Column";
private static final char noQuoteCharacter = '\0';
@ -113,15 +115,17 @@ public class DelimitedReader {
String commentCharacter,
boolean warningsAsErrors,
ProblemAggregator problemAggregator) {
if (delimiter.isEmpty()) {
throw new IllegalArgumentException("Empty delimiters are not supported.");
}
if (delimiter.length() > 1) {
throw new IllegalArgumentException(
"Delimiters consisting of multiple characters or code units are not supported.");
}
this.delimiter = delimiter.charAt(0);
if (delimiter.isEmpty()) {
// User wants to read each row into a single cell. So we delimit on a character that we assume
// is not in user data
this.delimiter = UNUSED_CHARACTER;
} else {
this.delimiter = delimiter.charAt(0);
}
if (quote != null) {
if (quote.isEmpty()) {
@ -198,7 +202,7 @@ public class DelimitedReader {
}
if (commentCharacter == null) {
format.setComment(UNUSED_CHARACTER);
format.setComment(COMMENT_CHARACTER);
} else {
if (commentCharacter.length() != 1) {
throw new IllegalArgumentException(

View File

@ -207,7 +207,7 @@ public class DelimitedWriter {
* TODO This should be checking if commenting is enabled, but currently
* due to limitations of the reader library it is always enabled, just
* sometimes the comment char is set to `\0`. See the documentation of
* {@link DelimitedReader#UNUSED_CHARACTER}.
* {@link DelimitedReader#COMMENT_CHARACTER}.
*
* See issue https://github.com/enso-org/enso/issues/5655
*/

View File

@ -398,6 +398,15 @@ spec =
t2.at "c" . to_vector . should_equal [3, 6]
t2.columns.map .name . should_equal ["a", "b", "c"]
Test.specify "should be able to read in a file without splitting it to columns" <|
t1 = (enso_project.data / "data_small.csv") . read (Delimited "" headers=False)
expected = ['Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,"Hexadecimal",Leading0s,QuotedNumbers,"Mixed Types"']
+ ['gxl,7,True,38.76109,63.13, pq6igd2wyd ,4DD4675B,001,"1","33"']
+ ['wca,0,False,-66.77495,31," 2pr4102wc4 ",,002,"2",']
+ ['nfw,1, True , 88.65713\t\t\t,-68.71,"",01896EAB,123,,45']
+ ['der,7,True,0.86658,,,F32E1EFE,,"34",True']
t1.at 0 . to_vector . should_equal expected
Test.specify "should be able to parse raw text" <|
text1 = """
a,b,c