Give file read its own helper widget for delimiters. (#8627)

Give file read its own helper widget for delimiters. Remove the newline option and add a none option. The file read delimiter is similar to, but different from, the split one, and so it should have its own set of options.
2024-10-26 21:19:02 +03:00 · 2024-01-04 11:59:42 +00:00 · 2024-01-04 11:59:42 +00:00 · bf8dd1888c
commit bf8dd1888c
parent 20531d51df
6 changed files with 33 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -599,6 +599,8 @@
 - [Support for loading big Excel files.][8403]
 - [Added new `Filter_Condition`s - `Equal_Ignore_Case`, `Is_Nan`, `Is_Infinite`
  and `Is_Finite`.][8539]
+- [Added text_length to Column][8606]
+- [Added none delimiter option for Data.Read][8627]

 [debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -858,6 +860,8 @@
 [8403]: https://github.com/enso-org/enso/pull/8403
 [8539]: https://github.com/enso-org/enso/pull/8539
 [8564]: https://github.com/enso-org/enso/pull/8564
+[8606]: https://github.com/enso-org/enso/pull/8606
+[8627]: https://github.com/enso-org/enso/pull/8627

 #### Enso Compiler

@ -1011,7 +1015,6 @@
 - [Upgrade GraalVM to 23.1.0 JDK21][7991]
 - [Added opt-in type checks of return type][8502]
 - [DataflowError.withoutTrace doesn't store stacktrace][8608]
-- [Added text_length to Column][8606]

 [3227]: https://github.com/enso-org/enso/pull/3227
 [3248]: https://github.com/enso-org/enso/pull/3248
@ -1164,7 +1167,6 @@
 [7991]: https://github.com/enso-org/enso/pull/7991
 [8502]: https://github.com/enso-org/enso/pull/8502
 [8608]: https://github.com/enso-org/enso/pull/8608
-[8606]: https://github.com/enso-org/enso/pull/8606

 # Enso 2.0.0-alpha.18 (2021-10-12)

--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Widget_Helpers.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Widget_Helpers.enso
@ -19,6 +19,12 @@ make_delimiter_selector : Widget
 make_delimiter_selector =
    make_single_choice [',', ';', '|', ['{tab}', "'\t'"], ['{space}', "' '"], ['{newline}', "['\n', '\r\n', '\r']"], '_', ['Custom', "'?'"]]

+ ## PRIVATE
+      Creates a Single_Choice Widget for file read delimiters; unlike `make_delimiter_selector` it offers `{none}` (empty text) instead of `{newline}`.
+make_file_read_delimiter_selector : Widget
+make_file_read_delimiter_selector =
+        make_single_choice [',', ';', '|', ['{tab}', "'\t'"], ['{space}', "' '"], ['{none}', "''"], '_', ['Custom', "'?'"]]
+
 ## PRIVATE
   Creates a Single_Choice Widget for parsing dates.
 make_date_format_selector : Date -> Widget
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Delimited_Format.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Delimited/Delimited_Format.enso
@ -2,7 +2,7 @@ from Standard.Base import all
 import Standard.Base.Network.HTTP.Response.Response
 import Standard.Base.System.File_Format_Metadata.File_Format_Metadata
 import Standard.Base.System.Input_Stream.Input_Stream
-from Standard.Base.Widget_Helpers import make_delimiter_selector
+from Standard.Base.Widget_Helpers import make_file_read_delimiter_selector

 import project.Data.Data_Formatter.Data_Formatter
 import project.Data.Match_Columns.Match_Columns
@ -52,7 +52,7 @@ type Delimited_Format
         character if it appears anywhere else than at the beginning of the line. This
         option is only applicable for read mode and does not affect writing. It
         defaults to `Nothing` which means that comments are disabled.
-    @delimiter make_delimiter_selector
+    @delimiter make_file_read_delimiter_selector
    @encoding Encoding.default_widget
    Delimited (delimiter:Text=',') (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter.Value) (keep_invalid_rows:Boolean=True) (line_endings:Line_Ending_Style|Infer=Infer) (comment_character:Text|Nothing=Nothing)

--- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java
+++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java
@ -42,7 +42,9 @@ public class DelimitedReader {
   * <p>I considered choosing `\uF8EE`, which comes from the Private Use Area of the Basic
   * Multilingual Plane. It has no meaning designated by the Unicode standard.
   */
-  public static final char UNUSED_CHARACTER = '\0';
+  public static final char COMMENT_CHARACTER = '\0';
+
+  public static final char UNUSED_CHARACTER = '\uF8EE';

  private static final String COLUMN_NAME = "Column";
  private static final char noQuoteCharacter = '\0';
@ -113,15 +115,17 @@ public class DelimitedReader {
      String commentCharacter,
      boolean warningsAsErrors,
      ProblemAggregator problemAggregator) {
-    if (delimiter.isEmpty()) {
-      throw new IllegalArgumentException("Empty delimiters are not supported.");
-    }
    if (delimiter.length() > 1) {
      throw new IllegalArgumentException(
          "Delimiters consisting of multiple characters or code units are not supported.");
    }
-
-    this.delimiter = delimiter.charAt(0);
+    if (delimiter.isEmpty()) {
+      // User wants to read each row into a single cell. So we delimit on a character that we assume
+      // is not in user data
+      this.delimiter = UNUSED_CHARACTER;
+    } else {
+      this.delimiter = delimiter.charAt(0);
+    }

    if (quote != null) {
      if (quote.isEmpty()) {
@ -198,7 +202,7 @@ public class DelimitedReader {
    }

    if (commentCharacter == null) {
-      format.setComment(UNUSED_CHARACTER);
+      format.setComment(COMMENT_CHARACTER);
    } else {
      if (commentCharacter.length() != 1) {
        throw new IllegalArgumentException(
--- a/std-bits/table/src/main/java/org/enso/table/write/DelimitedWriter.java
+++ b/std-bits/table/src/main/java/org/enso/table/write/DelimitedWriter.java
@ -207,7 +207,7 @@ public class DelimitedWriter {
       * TODO This should be checking if commenting is enabled, but currently
       * due to limitations of the reader library it is always enabled, just
       * sometimes the comment char is set to `\0`. See the documentation of
-       * {@link DelimitedReader#UNUSED_CHARACTER}.
+       * {@link DelimitedReader#COMMENT_CHARACTER}.
       *
       * See issue https://github.com/enso-org/enso/issues/5655
       */
--- a/test/Table_Tests/src/IO/Delimited_Read_Spec.enso
+++ b/test/Table_Tests/src/IO/Delimited_Read_Spec.enso
@ -398,6 +398,15 @@ spec =
            t2.at "c" . to_vector . should_equal [3, 6]
            t2.columns.map .name . should_equal ["a", "b", "c"]

+        Test.specify "should be able to read in a file without splitting it to columns" <|
+            t1 = (enso_project.data / "data_small.csv") . read (Delimited "" headers=False)
+            expected = ['Code,Index,Flag,Value,ValueWithNothing,TextWithNothing,"Hexadecimal",Leading0s,QuotedNumbers,"Mixed Types"']
+                + ['gxl,7,True,38.76109,63.13,   pq6igd2wyd  ,4DD4675B,001,"1","33"']
+                + ['wca,0,False,-66.77495,31,"  2pr4102wc4  ",,002,"2",']
+                + ['nfw,1,  True ,  88.65713\t\t\t,-68.71,"",01896EAB,123,,45']
+                + ['der,7,True,0.86658,,,F32E1EFE,,"34",True']
+            t1.at 0 . to_vector . should_equal expected
+
        Test.specify "should be able to parse raw text" <|
            text1 = """
                a,b,c