Add new text_left and text_right functions (#8691)

Added text_left and text_right functions for in-memory and databases
2024-12-23 04:43:26 +03:00 · 2024-01-15 23:43:23 +00:00 · 2024-01-15 23:43:23 +00:00 · b8e93b3cba
commit b8e93b3cba
parent 943b857de1
13 changed files with 272 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -601,6 +601,7 @@
  and `Is_Finite`.][8539]
 - [Added text_length to Column][8606]
 - [Added none delimiter option for Data.Read][8627]
+- [Added text_left and text_right to Column][8691]

 [debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -862,6 +863,7 @@
 [8564]: https://github.com/enso-org/enso/pull/8564
 [8606]: https://github.com/enso-org/enso/pull/8606
 [8627]: https://github.com/enso-org/enso/pull/8627
+[8691]: https://github.com/enso-org/enso/pull/8691

 #### Enso Compiler

--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Column.enso
@ -1195,6 +1195,48 @@ type Column
            new_name = self.naming_helper.function_name "text_length" [self]
            self.make_unary_op "LENGTH" new_name

+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the left n characters for each element of the column.
+
+       In the Database backends, the default text left method of the
+       particular database is used.
+
+       In the in-memory backend, this will give you the left n graphemes of the string.
+
+       > Example
+             import Standard.Examples
+
+             example_text_left =
+                Examples.text_column_1.text_left 5
+    text_left : Column|Integer -> Column
+    text_left self n =
+        Value_Type.expect_text self <| Value_Type.expect_integer n <|
+            n2 = n.max 0
+            new_name = self.naming_helper.function_name "text_left" [self, n]
+            self.make_binary_op "LEFT" n2 new_name
+
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the right n characters for each element of the column.
+
+       In the Database backends, the default text right method of the
+       particular database is used.
+
+       In the in-memory backend, this will give you the right n graphemes of the string.
+
+       > Example
+             import Standard.Examples
+
+             example_text_right =
+                Examples.text_column_1.text_right 5
+    text_right : Column|Integer -> Column
+    text_right self n =
+        Value_Type.expect_text self <| Value_Type.expect_integer n <|
+            n2 = n.max 0
+            new_name = self.naming_helper.function_name "text_right" [self, n]
+            self.make_binary_op "RIGHT" n2 new_name           
+
    ## GROUP Standard.Base.Logical
       Checks for each element of the column if it contains `other`.

--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/Postgres/Postgres_Dialect.enso
@ -287,7 +287,7 @@ type Postgres_Dialect
 ## PRIVATE
 make_internal_generator_dialect =
    cases = [["LOWER", Base_Generator.make_function "LOWER"], ["UPPER", Base_Generator.make_function "UPPER"]]
-    text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive, ["REPLACE", replace]]+concat_ops+cases+trim_ops
+    text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive, ["REPLACE", replace], left, right]+concat_ops+cases+trim_ops
    counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
    arith_extensions = [is_nan, is_inf, floating_point_div, mod_op, decimal_div, decimal_mod, ["ROW_MIN", Base_Generator.make_function "LEAST"], ["ROW_MAX", Base_Generator.make_function "GREATEST"]]
    bool = [bool_or]
@ -486,6 +486,14 @@ make_contains_expr expr substring =
 ## PRIVATE
 contains = Base_Generator.lift_binary_op "contains" make_contains_expr

+## PRIVATE
+left = Base_Generator.lift_binary_op "LEFT" str-> n->
+    Builder.code "left(" ++ str ++ ", CAST(" ++ n ++ " AS INT))"
+
+## PRIVATE
+right = Base_Generator.lift_binary_op "RIGHT" str-> n->
+    Builder.code "right(" ++ str ++ ", CAST(" ++ n ++ " AS INT))"
+
 ## PRIVATE
 make_order_descriptor internal_column sort_direction text_ordering =
    nulls = case sort_direction of
--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Dialect.enso
@ -282,7 +282,7 @@ type SQLite_Dialect

 ## PRIVATE
 make_internal_generator_dialect =
-    text = [starts_with, contains, ends_with, make_case_sensitive, ["REPLACE", replace]]+concat_ops+trim_ops
+    text = [starts_with, contains, ends_with, make_case_sensitive, ["REPLACE", replace], left, right]+concat_ops+trim_ops
    counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
    stats = [agg_stddev_pop, agg_stddev_samp]
    arith_extensions = [is_inf, floating_point_div, mod_op]
@ -409,6 +409,14 @@ make_contains_expr expr substring =
 ## PRIVATE
 contains = Base_Generator.lift_binary_op "contains" make_contains_expr

+## PRIVATE
+left = Base_Generator.lift_binary_op "LEFT" str-> n->
+    Builder.code "substr(" ++ str ++ ", 0, " ++ n ++ " + 1)"
+
+## PRIVATE
+right = Base_Generator.lift_binary_op "RIGHT" str-> n->
+    Builder.code "substr(" ++ str ++ ", -" ++ n ++ ", " ++ n ++ ")"
+
 ## PRIVATE
 bool_or = Base_Generator.lift_unary_op "BOOL_OR" arg->
    Builder.code "max(" ++ arg ++ ")"
--- a/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Type_Mapping.enso
+++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Internal/SQLite/SQLite_Type_Mapping.enso
@ -184,7 +184,7 @@ operations_map =

    always_boolean_ops = ["==", "!=", "equals_ignore_case", ">=", "<=", "<", ">", "BETWEEN", "AND", "OR", "NOT", "IS_NULL", "IS_EMPTY", "LIKE", "IS_IN", "IS_IN_COLUMN", "starts_with", "ends_with", "contains", "BOOL_OR", "IS_INF"]
    always_floating_ops = ["/", "mod", "AVG", "STDDEV_POP", "STDDEV_SAMP", "ROUND"]
-    always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE"]
+    always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE", "LEFT", "RIGHT"]
    always_integer_ops = ["COUNT", "COUNT_IS_NULL", "COUNT_DISTINCT", "COUNT_DISTINCT_INCLUDE_NULL", "COUNT_EMPTY", "COUNT_NOT_EMPTY", "COUNT_ROWS", "ROW_NUMBER", "ROW_NUMBER_IN_GROUP", "LENGTH"]
    same_as_first = ["TRUNCATE", "CEIL", "FLOOR"]
    arithmetic_ops = ["ADD_NUMBER", "-", "*", "^", "%", "SUM"]
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Column.enso
@ -1256,6 +1256,48 @@ type Column
        Value_Type.expect_text self <|
            simple_unary_op self Java_Storage.Maps.TEXT_LENGTH

+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the left n characters for each element of the column.
+
+       In the Database backends, the default text left method of the
+       particular database is used.
+
+       In the in-memory backend, this will give you the left n graphemes of the string.
+
+       > Example
+             import Standard.Examples
+
+             example_text_left =
+                Examples.text_column_1.text_left 5
+    text_left : Column|Integer -> Column
+    text_left self n =
+        Value_Type.expect_text self <|
+            Value_Type.expect_integer n <|
+                new_name = naming_helper.function_name "text_left" [self, n]
+                run_vectorized_binary_op self Java_Storage.Maps.TEXT_LEFT n new_name
+
+    ## GROUP Standard.Base.Text
+       ICON preparation
+       Gets the right n characters for each element of the column.
+
+       In the Database backends, the default text right method of the
+       particular database is used.
+
+       In the in-memory backend, this will give you the right n graphemes of the string.
+
+       > Example
+             import Standard.Examples
+
+             example_text_right =
+                Examples.text_column_1.text_right 5
+    text_right : Column|Integer -> Column
+    text_right self n =
+        Value_Type.expect_text self <|
+            Value_Type.expect_integer n <|
+                new_name = naming_helper.function_name "text_right" [self, n]
+                run_vectorized_binary_op self Java_Storage.Maps.TEXT_RIGHT n new_name
+
    ## GROUP Standard.Base.Logical
       Checks for each element of the column if it contains `other`.

--- a/lib/scala/common-polyglot-core-utils/src/main/java/org/enso/polyglot/common_utils/Core_Text_Utils.java
+++ b/lib/scala/common-polyglot-core-utils/src/main/java/org/enso/polyglot/common_utils/Core_Text_Utils.java
@ -52,7 +52,9 @@ public class Core_Text_Utils {
  public static String take_prefix(String str, long grapheme_length) {
    BreakIterator iter = BreakIterator.getCharacterInstance();
    iter.setText(str);
-    if (iter.next(Math.toIntExact(grapheme_length)) == BreakIterator.DONE) {
+    if (grapheme_length <= 0) {
+      return "";
+    } else if (iter.next(Math.toIntExact(grapheme_length)) == BreakIterator.DONE) {
      return str;
    } else {
      return str.substring(0, iter.current());
--- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@ -294,7 +294,9 @@ public class Text_Utils {
    BreakIterator iter = BreakIterator.getCharacterInstance();
    iter.setText(str);
    iter.last();
-    if (iter.next(Math.toIntExact(-grapheme_length)) == BreakIterator.DONE) {
+    if (grapheme_length <= 0) {
+      return "";
+    } else if (iter.next(Math.toIntExact(-grapheme_length)) == BreakIterator.DONE) {
      return str;
    } else {
      return str.substring(iter.current());
--- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringLongToStringOp.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/map/text/StringLongToStringOp.java
@ -0,0 +1,75 @@
+package org.enso.table.data.column.operation.map.text;
+
+import org.enso.table.data.column.builder.StringBuilder;
+import org.enso.table.data.column.operation.map.BinaryMapOperation;
+import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
+import org.enso.table.data.column.storage.SpecializedStorage;
+import org.enso.table.data.column.storage.Storage;
+import org.enso.table.data.column.storage.StringStorage;
+import org.enso.table.data.column.storage.numeric.LongStorage;
+import org.enso.table.data.column.storage.type.TextType;
+import org.enso.table.error.UnexpectedTypeException;
+import org.graalvm.polyglot.Context;
+
+public abstract class StringLongToStringOp
+    extends BinaryMapOperation<String, SpecializedStorage<String>> {
+  public StringLongToStringOp(String name) {
+    super(name);
+  }
+
+  protected abstract String doOperation(String a, long b); // a: cell text, b: integer argument
+
+  @Override
+  public Storage<?> runBinaryMap(
+      SpecializedStorage<String> storage,
+      Object arg,
+      MapOperationProblemAggregator problemAggregator) {
+    int size = storage.size();
+    if (arg == null) { // a null argument yields an all-null result column
+      StringBuilder builder = new StringBuilder(size, TextType.VARIABLE_LENGTH);
+      builder.appendNulls(size);
+      return builder.seal();
+    } else if (arg instanceof Long argLong) {
+      String[] newVals = new String[size];
+      Context context = Context.getCurrent();
+      for (int i = 0; i < size; i++) {
+        if (storage.isNa(i)) {
+          newVals[i] = null;
+        } else {
+          newVals[i] = doOperation(storage.getItem(i), argLong);
+        }
+
+        context.safepoint();
+      }
+
+      return new StringStorage(newVals, size, (TextType) storage.getType());
+    } else {
+      throw new UnexpectedTypeException("an Integer"); // the scalar argument must be a Long
+    }
+  }
+
+  @Override
+  public Storage<?> runZip(
+      SpecializedStorage<String> storage,
+      Storage<?> arg,
+      MapOperationProblemAggregator problemAggregator) {
+    if (arg instanceof LongStorage v) {
+      int size = storage.size();
+      String[] newVals = new String[size];
+      Context context = Context.getCurrent();
+      for (int i = 0; i < size; i++) {
+        if (storage.isNa(i) || v.isNa(i)) { // null on either side yields null
+          newVals[i] = null;
+        } else {
+          newVals[i] = doOperation(storage.getItem(i), v.getItem(i));
+        }
+
+        context.safepoint();
+      }
+
+      return new StringStorage(newVals, size, (TextType) storage.getType());
+    } else {
+      throw new UnexpectedTypeException("an Integer column"); // only LongStorage is supported
+    }
+  }
+}
--- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/Storage.java
@ -110,6 +110,8 @@ public abstract class Storage<T> {
    public static final String STARTS_WITH = "starts_with";
    public static final String ENDS_WITH = "ends_with";
    public static final String TEXT_LENGTH = "text_length";
+    public static final String TEXT_LEFT = "text_left";
+    public static final String TEXT_RIGHT = "text_right";
    public static final String CONTAINS = "contains";
    public static final String LIKE = "like";
    public static final String IS_IN = "is_in";
--- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java
@ -10,6 +10,7 @@ import org.enso.table.data.column.operation.map.numeric.UnaryIntegerOp;
 import org.enso.table.data.column.operation.map.text.LikeOp;
 import org.enso.table.data.column.operation.map.text.StringBooleanOp;
 import org.enso.table.data.column.operation.map.text.StringIsInOp;
+import org.enso.table.data.column.operation.map.text.StringLongToStringOp;
 import org.enso.table.data.column.operation.map.text.StringStringOp;
 import org.enso.table.data.column.storage.type.StorageType;
 import org.enso.table.data.column.storage.type.TextType;
@ -129,6 +130,20 @@ public final class StringStorage extends SpecializedStorage<String> {
            return Text_Utils.grapheme_length(a);
          }
        });
+    t.add(
+        new StringLongToStringOp(Maps.TEXT_LEFT) {
+          @Override
+          protected String doOperation(String a, long b) {
+            return Text_Utils.take_prefix(a, b);
+          }
+        });
+    t.add(
+        new StringLongToStringOp(Maps.TEXT_RIGHT) {
+          @Override
+          protected String doOperation(String a, long b) {
+            return Text_Utils.take_suffix(a, b);
+          }
+        });
    t.add(
        new StringBooleanOp(Maps.CONTAINS) {
          @Override
--- a/test/Table_Tests/src/Common_Table_Operations/Column_Operations_Spec.enso
+++ b/test/Table_Tests/src/Common_Table_Operations/Column_Operations_Spec.enso
@ -925,6 +925,71 @@ spec setup =
            t = table_builder [["numbers", [1, 2, 3]]]
            col = t.at "numbers"
            col.text_length . should_fail_with Invalid_Value_Type
+        
+        Test.specify "should handle operation text_left and text_right with length 1" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings" . cast (Value_Type.Char size=286 variable_length=True)
+                resLeft = col.text_left 1
+                resLeft.name . should_equal "text_left([strings], 1)"
+                resRight = col.text_right 1
+                resRight.name . should_equal "text_right([strings], 1)"
+                resLeft . to_vector . should_equal ["a", "f", "", Nothing, "c", "I"] 
+                resRight . to_vector . should_equal ["a", "r", "", Nothing, "é", "."]
+                case setup.is_database of
+                    False -> resLeft . value_type . should_equal (Value_Type.Char size=286 variable_length=True)
+                    True -> resLeft . value_type . should_equal (Value_Type.Char variable_length=True)
+                case setup.is_database of
+                    False -> resRight . value_type . should_equal (Value_Type.Char size=286 variable_length=True)
+                    True -> resRight . value_type . should_equal (Value_Type.Char variable_length=True)
+        
+        Test.specify "should handle operation text_left and text_right of grapheme and non-grapheme" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩‍🔬👩‍🔬V👩‍🔬👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings"
+                resLeft = col.text_left 3
+                resLeft.name . should_equal "text_left([strings], 3)"
+                resRight = col.text_right 3
+                resRight.name . should_equal "text_right([strings], 3)"
+
+                case setup.is_database of
+                    False -> resLeft . to_vector . should_equal ["a", "foo", "", Nothing, "👩‍🔬👩‍🔬V", "caf", "It "] # Grapheme Length
+                    True -> resLeft . to_vector . should_equal ["a", "foo", "", Nothing, "👩‍🔬", "caf", "It "]  # Storage Length
+                case setup.is_database of
+                    False -> resRight . to_vector . should_equal ["a", "bar", "", Nothing, "V👩‍🔬👩‍🔬", "afé", "ir."] # Grapheme Length
+                    True -> resRight . to_vector . should_equal ["a", "bar", "", Nothing, "👩‍🔬", "afé", "ir."]  # Storage Length
+        
+        Test.specify "text_left and text_right should error on non-string columns" <|
+            t = table_builder [["numbers", [1, 2, 3]]]
+            col = t.at "numbers"
+            col.text_left 6 . should_fail_with Invalid_Value_Type
+            col.text_right 6 . should_fail_with Invalid_Value_Type
+
+        Test.specify "text_left and text_right should error on non integer parameters" <|
+            t = table_builder [["numbers", [1, 2, 3]]]  # NOTE(review): numeric column - the column-type check may fail before `n` is validated; consider a text column
+            col = t.at "numbers"
+            col.text_left 3.14 . should_fail_with Invalid_Value_Type
+            col.text_right 3.14 . should_fail_with Invalid_Value_Type
+            col.text_left "7" . should_fail_with Invalid_Value_Type
+            col.text_right "7" . should_fail_with Invalid_Value_Type
+
+        Test.specify "text_left and text_right should return empty on zero argument" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩‍🔬👩‍🔬V👩‍🔬👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings"
+                resLeft = col.text_left 0
+                resLeft.name . should_equal "text_left([strings], 0)"
+                resRight = col.text_right 0
+                resRight.name . should_equal "text_right([strings], 0)"
+                resLeft . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
+                resRight . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
+
+        Test.specify "text_left and text_right should return empty on negative arguments" <|
+            with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩‍🔬👩‍🔬V👩‍🔬👩‍🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
+                col = t.at "strings"
+                resLeft = col.text_left -3
+                resLeft.name . should_equal "text_left([strings], -3)"
+                resRight = col.text_right -3
+                resRight.name . should_equal "text_right([strings], -3)"
+                resLeft . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
+                resRight . to_vector . should_equal ["", "", "", Nothing, "", "", ""]

        Test.specify "should handle operations like is_empty, is_blank, fill_empty" <|
            with_mixed_columns_if_supported [["s", ["", " ", "  ", Nothing, "foo"]], ["letters", ["a", "b", "c", "d", "e"]]] t->
--- a/test/Tests/src/Data/Text/Utils_Spec.enso
+++ b/test/Tests/src/Data/Text/Utils_Spec.enso
@ -70,6 +70,8 @@ spec =
            Text_Utils.take_prefix txt 5 . should_equal txt
            Text_Utils.take_prefix txt 400 . should_equal txt
            Text_Utils.take_prefix txt 0 . should_equal ''
+            Text_Utils.take_prefix txt -1 . should_equal ''
+            Text_Utils.take_prefix txt -42 . should_equal ''

            Text_Utils.take_suffix txt 1 . should_equal 'c\u0301'
            Text_Utils.take_suffix txt 2 . should_equal 'śc\u0301'
@ -78,6 +80,8 @@ spec =
            Text_Utils.take_suffix txt 5 . should_equal txt
            Text_Utils.take_suffix txt 400 . should_equal txt
            Text_Utils.take_suffix txt 0 . should_equal ''
+            Text_Utils.take_suffix txt -1 . should_equal ''
+            Text_Utils.take_suffix txt -42 . should_equal ''

            Text_Utils.take_prefix '🚀🚧' 1 . should_equal '🚀'
            Text_Utils.take_prefix '🚀🚧' 2 . should_equal '🚀🚧'