Add new text_left and text_right functions (#8691)

Added text_left and text_right functions for in-memory and databases
This commit is contained in:
AdRiley 2024-01-15 23:43:23 +00:00 committed by GitHub
parent 943b857de1
commit b8e93b3cba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 272 additions and 5 deletions

View File

@ -601,6 +601,7 @@
and `Is_Finite`.][8539]
- [Added text_length to Column][8606]
- [Added none delimiter option for Data.Read][8627]
- [Added text_left and text_right to Column][8691]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -862,6 +863,7 @@
[8564]: https://github.com/enso-org/enso/pull/8564
[8606]: https://github.com/enso-org/enso/pull/8606
[8627]: https://github.com/enso-org/enso/pull/8627
[8691]: https://github.com/enso-org/enso/pull/8691
#### Enso Compiler

View File

@ -1195,6 +1195,48 @@ type Column
new_name = self.naming_helper.function_name "text_length" [self]
self.make_unary_op "LENGTH" new_name
## GROUP Standard.Base.Text
ICON preparation
Gets the left n characters for each element of the column.
In the Database backends, the default text left method of the
particular database is used.
In the in-memory backend, this will give you the left n graphemes of the string.
Arguments:
- n: The number of characters to take, given as an Integer or a Column.
It is clamped with `n.max 0` before it is sent to the database, so a
negative count behaves like 0.
> Example
Take the first 5 characters of every element.
import Standard.Examples
example_text_left =
Examples.text_column_1.text_left 5
text_left : Column|Integer -> Column
text_left self n =
Value_Type.expect_text self <| Value_Type.expect_integer n <|
# Only the operation receives the clamped value; the generated column
# name below is built from the argument exactly as the caller passed it.
n2 = n.max 0
new_name = self.naming_helper.function_name "text_left" [self, n]
self.make_binary_op "LEFT" n2 new_name
## GROUP Standard.Base.Text
ICON preparation
Gets the right n characters for each element of the column.
In the Database backends, the default text right method of the
particular database is used.
In the in-memory backend, this will give you the right n graphemes of the string.
Arguments:
- n: The number of characters to take, given as an Integer or a Column.
It is clamped with `n.max 0` before it is sent to the database, so a
negative count behaves like 0.
> Example
Take the last 5 characters of every element.
import Standard.Examples
example_text_right =
Examples.text_column_1.text_right 5
text_right : Column|Integer -> Column
text_right self n =
Value_Type.expect_text self <| Value_Type.expect_integer n <|
# Only the operation receives the clamped value; the generated column
# name below is built from the argument exactly as the caller passed it.
n2 = n.max 0
new_name = self.naming_helper.function_name "text_right" [self, n]
self.make_binary_op "RIGHT" n2 new_name
## GROUP Standard.Base.Logical
Checks for each element of the column if it contains `other`.

View File

@ -287,7 +287,7 @@ type Postgres_Dialect
## PRIVATE
make_internal_generator_dialect =
cases = [["LOWER", Base_Generator.make_function "LOWER"], ["UPPER", Base_Generator.make_function "UPPER"]]
text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive, ["REPLACE", replace]]+concat_ops+cases+trim_ops
text = [starts_with, contains, ends_with, agg_shortest, agg_longest, make_case_sensitive, ["REPLACE", replace], left, right]+concat_ops+cases+trim_ops
counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
arith_extensions = [is_nan, is_inf, floating_point_div, mod_op, decimal_div, decimal_mod, ["ROW_MIN", Base_Generator.make_function "LEAST"], ["ROW_MAX", Base_Generator.make_function "GREATEST"]]
bool = [bool_or]
@ -486,6 +486,14 @@ make_contains_expr expr substring =
## PRIVATE
contains = Base_Generator.lift_binary_op "contains" make_contains_expr
## PRIVATE
   SQL generator for the "LEFT" operation in the Postgres dialect.
   Emits `left(<text>, CAST(<count> AS INT))`.
   NOTE(review): the cast presumably exists so that an interpolated count is
   accepted as the integer argument of `left` - confirm.
left = Base_Generator.lift_binary_op "LEFT" txt-> chars->
    Builder.code "left(" ++ txt ++ ", CAST(" ++ chars ++ " AS INT))"
## PRIVATE
   SQL generator for the "RIGHT" operation in the Postgres dialect.
   Emits `right(<text>, CAST(<count> AS INT))`.
   NOTE(review): the cast presumably exists so that an interpolated count is
   accepted as the integer argument of `right` - confirm.
right = Base_Generator.lift_binary_op "RIGHT" txt-> chars->
    Builder.code "right(" ++ txt ++ ", CAST(" ++ chars ++ " AS INT))"
## PRIVATE
make_order_descriptor internal_column sort_direction text_ordering =
nulls = case sort_direction of

View File

@ -282,7 +282,7 @@ type SQLite_Dialect
## PRIVATE
make_internal_generator_dialect =
text = [starts_with, contains, ends_with, make_case_sensitive, ["REPLACE", replace]]+concat_ops+trim_ops
text = [starts_with, contains, ends_with, make_case_sensitive, ["REPLACE", replace], left, right]+concat_ops+trim_ops
counts = [agg_count_is_null, agg_count_empty, agg_count_not_empty, ["COUNT_DISTINCT", agg_count_distinct], ["COUNT_DISTINCT_INCLUDE_NULL", agg_count_distinct_include_null]]
stats = [agg_stddev_pop, agg_stddev_samp]
arith_extensions = [is_inf, floating_point_div, mod_op]
@ -409,6 +409,14 @@ make_contains_expr expr substring =
## PRIVATE
contains = Base_Generator.lift_binary_op "contains" make_contains_expr
## PRIVATE
   SQL generator for the "LEFT" operation in the SQLite dialect.
   Emits `substr(<text>, 0, <count> + 1)`.
   NOTE(review): SQLite text positions start at 1, so starting at 0 with a
   length of count + 1 presumably yields the first `count` characters -
   confirm against the SQLite `substr` documentation.
left = Base_Generator.lift_binary_op "LEFT" txt-> count->
    Builder.code "substr(" ++ txt ++ ", 0, " ++ count ++ " + 1)"
## PRIVATE
   SQL generator for the "RIGHT" operation in the SQLite dialect.
   Emits `substr(<text>, -<count>, <count>)`; the count expression is
   therefore rendered twice in the generated SQL.
   NOTE(review): presumably relies on a negative start offset being counted
   from the end of the text - confirm against the SQLite `substr`
   documentation.
right = Base_Generator.lift_binary_op "RIGHT" txt-> count->
    Builder.code "substr(" ++ txt ++ ", -" ++ count ++ ", " ++ count ++ ")"
## PRIVATE
bool_or = Base_Generator.lift_unary_op "BOOL_OR" arg->
Builder.code "max(" ++ arg ++ ")"

View File

@ -184,7 +184,7 @@ operations_map =
always_boolean_ops = ["==", "!=", "equals_ignore_case", ">=", "<=", "<", ">", "BETWEEN", "AND", "OR", "NOT", "IS_NULL", "IS_EMPTY", "LIKE", "IS_IN", "IS_IN_COLUMN", "starts_with", "ends_with", "contains", "BOOL_OR", "IS_INF"]
always_floating_ops = ["/", "mod", "AVG", "STDDEV_POP", "STDDEV_SAMP", "ROUND"]
always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE"]
always_text_ops = ["ADD_TEXT", "CONCAT", "CONCAT_QUOTE_IF_NEEDED", "MAKE_CASE_SENSITIVE", "FOLD_CASE", "TRIM", "LTRIM", "RTRIM", "REPLACE", "LEFT", "RIGHT"]
always_integer_ops = ["COUNT", "COUNT_IS_NULL", "COUNT_DISTINCT", "COUNT_DISTINCT_INCLUDE_NULL", "COUNT_EMPTY", "COUNT_NOT_EMPTY", "COUNT_ROWS", "ROW_NUMBER", "ROW_NUMBER_IN_GROUP", "LENGTH"]
same_as_first = ["TRUNCATE", "CEIL", "FLOOR"]
arithmetic_ops = ["ADD_NUMBER", "-", "*", "^", "%", "SUM"]

View File

@ -1256,6 +1256,48 @@ type Column
Value_Type.expect_text self <|
simple_unary_op self Java_Storage.Maps.TEXT_LENGTH
## GROUP Standard.Base.Text
ICON preparation
Gets the left n characters for each element of the column.
In the Database backends, the default text left method of the
particular database is used.
In the in-memory backend, this will give you the left n graphemes of the string.
Arguments:
- n: The number of characters to take, given as an Integer or a Column.
> Example
Take the first 5 characters of every element.
import Standard.Examples
example_text_left =
Examples.text_column_1.text_left 5
text_left : Column|Integer -> Column
text_left self n =
# The column must be text and the argument must be an integer; both checks
# run before the vectorized operation is dispatched.
Value_Type.expect_text self <|
Value_Type.expect_integer n <|
new_name = naming_helper.function_name "text_left" [self, n]
# `n` is forwarded unchanged to the TEXT_LEFT storage operation.
run_vectorized_binary_op self Java_Storage.Maps.TEXT_LEFT n new_name
## GROUP Standard.Base.Text
ICON preparation
Gets the right n characters for each element of the column.
In the Database backends, the default text right method of the
particular database is used.
In the in-memory backend, this will give you the right n graphemes of the string.
Arguments:
- n: The number of characters to take, given as an Integer or a Column.
> Example
Take the last 5 characters of every element.
import Standard.Examples
example_text_right =
Examples.text_column_1.text_right 5
text_right : Column|Integer -> Column
text_right self n =
# The column must be text and the argument must be an integer; both checks
# run before the vectorized operation is dispatched.
Value_Type.expect_text self <|
Value_Type.expect_integer n <|
new_name = naming_helper.function_name "text_right" [self, n]
# `n` is forwarded unchanged to the TEXT_RIGHT storage operation.
run_vectorized_binary_op self Java_Storage.Maps.TEXT_RIGHT n new_name
## GROUP Standard.Base.Logical
Checks for each element of the column if it contains `other`.

View File

@ -52,7 +52,9 @@ public class Core_Text_Utils {
public static String take_prefix(String str, long grapheme_length) {
BreakIterator iter = BreakIterator.getCharacterInstance();
iter.setText(str);
if (iter.next(Math.toIntExact(grapheme_length)) == BreakIterator.DONE) {
if (grapheme_length <= 0) {
return "";
} else if (iter.next(Math.toIntExact(grapheme_length)) == BreakIterator.DONE) {
return str;
} else {
return str.substring(0, iter.current());

View File

@ -294,7 +294,9 @@ public class Text_Utils {
BreakIterator iter = BreakIterator.getCharacterInstance();
iter.setText(str);
iter.last();
if (iter.next(Math.toIntExact(-grapheme_length)) == BreakIterator.DONE) {
if (grapheme_length <= 0) {
return "";
} else if (iter.next(Math.toIntExact(-grapheme_length)) == BreakIterator.DONE) {
return str;
} else {
return str.substring(iter.current());

View File

@ -0,0 +1,75 @@
package org.enso.table.data.column.operation.map.text;
import org.enso.table.data.column.builder.StringBuilder;
import org.enso.table.data.column.operation.map.BinaryMapOperation;
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
import org.enso.table.data.column.storage.SpecializedStorage;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.column.storage.numeric.LongStorage;
import org.enso.table.data.column.storage.type.TextType;
import org.enso.table.error.UnexpectedTypeException;
import org.graalvm.polyglot.Context;
/**
 * Base class for binary map operations that combine every text value of a storage with an
 * integer argument and produce a text result (for example "take the first N characters").
 *
 * <p>Subclasses supply the per-row computation through {@link #doOperation(String, long)}. The
 * integer argument may be a single scalar ({@link #runBinaryMap}) or a column of integers
 * ({@link #runZip}).
 */
public abstract class StringLongToStringOp
    extends BinaryMapOperation<String, SpecializedStorage<String>> {

  /**
   * @param name the name under which this operation is registered
   */
  public StringLongToStringOp(String name) {
    super(name);
  }

  /**
   * Computes the result for a single row. It is only invoked for rows where neither the text
   * nor the integer is missing.
   *
   * @param a the text value of the row
   * @param b the integer argument for the row
   * @return the resulting text
   */
  protected abstract String doOperation(String a, long b);

  /**
   * Applies the operation to every row of {@code storage} using one scalar argument.
   *
   * @param storage the text storage to map over
   * @param arg the scalar argument; {@code null} produces an all-missing result, a {@link Long}
   *     is applied to every non-missing row
   * @param problemAggregator not used by this operation
   * @return a new text storage with one result per input row
   * @throws UnexpectedTypeException if {@code arg} is neither {@code null} nor a {@link Long}
   */
  @Override
  public Storage<?> runBinaryMap(
      SpecializedStorage<String> storage,
      Object arg,
      MapOperationProblemAggregator problemAggregator) {
    int size = storage.size();
    if (arg == null) {
      // A missing argument makes every row missing.
      // NOTE(review): this branch reports TextType.VARIABLE_LENGTH while the branch below keeps
      // the input storage's type - confirm the difference is intended.
      StringBuilder builder = new StringBuilder(size, TextType.VARIABLE_LENGTH);
      builder.appendNulls(size);
      return builder.seal();
    } else if (arg instanceof Long argLong) {
      // Unbox once instead of on every iteration.
      long count = argLong;
      String[] results = new String[size];
      Context context = Context.getCurrent();
      for (int row = 0; row < size; row++) {
        results[row] = storage.isNa(row) ? null : doOperation(storage.getItem(row), count);
        // Polled once per row; presumably lets the runtime interrupt a long-running loop.
        context.safepoint();
      }
      // NOTE(review): the result reuses the input's TextType; for a fixed-length input the
      // produced texts may be shorter than that type declares - confirm.
      return new StringStorage(results, size, (TextType) storage.getType());
    } else {
      // The argument of this operation is an integer, not a text.
      throw new UnexpectedTypeException("an Integer");
    }
  }

  /**
   * Applies the operation row by row, pairing each text with the integer found at the same
   * index of {@code arg}. A row is missing in the result if either side is missing.
   *
   * @param storage the text storage to map over
   * @param arg the argument column; must be a {@link LongStorage}
   * @param problemAggregator not used by this operation
   * @return a new text storage with one result per input row
   * @throws UnexpectedTypeException if {@code arg} is not a {@link LongStorage}
   */
  @Override
  public Storage<?> runZip(
      SpecializedStorage<String> storage,
      Storage<?> arg,
      MapOperationProblemAggregator problemAggregator) {
    if (arg instanceof LongStorage counts) {
      int size = storage.size();
      String[] results = new String[size];
      Context context = Context.getCurrent();
      for (int row = 0; row < size; row++) {
        if (storage.isNa(row) || counts.isNa(row)) {
          results[row] = null;
        } else {
          results[row] = doOperation(storage.getItem(row), counts.getItem(row));
        }
        // Polled once per row; presumably lets the runtime interrupt a long-running loop.
        context.safepoint();
      }
      return new StringStorage(results, size, (TextType) storage.getType());
    } else {
      // The argument column of this operation holds integers, not texts.
      throw new UnexpectedTypeException("an Integer column");
    }
  }
}

View File

@ -110,6 +110,8 @@ public abstract class Storage<T> {
public static final String STARTS_WITH = "starts_with";
public static final String ENDS_WITH = "ends_with";
public static final String TEXT_LENGTH = "text_length";
public static final String TEXT_LEFT = "text_left";
public static final String TEXT_RIGHT = "text_right";
public static final String CONTAINS = "contains";
public static final String LIKE = "like";
public static final String IS_IN = "is_in";

View File

@ -10,6 +10,7 @@ import org.enso.table.data.column.operation.map.numeric.UnaryIntegerOp;
import org.enso.table.data.column.operation.map.text.LikeOp;
import org.enso.table.data.column.operation.map.text.StringBooleanOp;
import org.enso.table.data.column.operation.map.text.StringIsInOp;
import org.enso.table.data.column.operation.map.text.StringLongToStringOp;
import org.enso.table.data.column.operation.map.text.StringStringOp;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.data.column.storage.type.TextType;
@ -129,6 +130,20 @@ public final class StringStorage extends SpecializedStorage<String> {
return Text_Utils.grapheme_length(a);
}
});
// Registers the TEXT_LEFT operation: `a` is the text of a row and `b` the
// requested count, and the result comes from Text_Utils.take_prefix.
t.add(
new StringLongToStringOp(Maps.TEXT_LEFT) {
@Override
protected String doOperation(String a, long b) {
return Text_Utils.take_prefix(a, b);
}
});
// Registers the TEXT_RIGHT operation: `a` is the text of a row and `b` the
// requested count, and the result comes from Text_Utils.take_suffix.
t.add(
new StringLongToStringOp(Maps.TEXT_RIGHT) {
@Override
protected String doOperation(String a, long b) {
return Text_Utils.take_suffix(a, b);
}
});
t.add(
new StringBooleanOp(Maps.CONTAINS) {
@Override

View File

@ -925,6 +925,71 @@ spec setup =
t = table_builder [["numbers", [1, 2, 3]]]
col = t.at "numbers"
col.text_length . should_fail_with Invalid_Value_Type
# Checks names, values and result types of text_left / text_right for n = 1.
Test.specify "should handle operation text_left and text_right with length 1" <|
with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
# Cast to a size-limited, variable-length Char so the result type can be
# compared against the input type below.
col = t.at "strings" . cast (Value_Type.Char size=286 variable_length=True)
resLeft = col.text_left 1
resLeft.name . should_equal "text_left([strings], 1)"
resRight = col.text_right 1
resRight.name . should_equal "text_right([strings], 1)"
# The empty text stays empty and Nothing stays Nothing.
resLeft . to_vector . should_equal ["a", "f", "", Nothing, "c", "I"]
resRight . to_vector . should_equal ["a", "r", "", Nothing, "é", "."]
# In-memory results are expected to keep the size limit of the input column;
# Database results are expected to report a variable-length Char without it.
case setup.is_database of
False -> resLeft . value_type . should_equal (Value_Type.Char size=286 variable_length=True)
True -> resLeft . value_type . should_equal (Value_Type.Char variable_length=True)
case setup.is_database of
False -> resRight . value_type . should_equal (Value_Type.Char size=286 variable_length=True)
True -> resRight . value_type . should_equal (Value_Type.Char variable_length=True)
Test.specify "should handle operation text_left and text_right of grapheme and non-grapheme" <|
with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩🔬👩🔬V👩🔬👩🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
col = t.at "strings"
resLeft = col.text_left 3
resLeft.name . should_equal "text_left([strings], 3)"
resRight = col.text_right 3
resRight.name . should_equal "text_right([strings], 3)"
case setup.is_database of
False -> resLeft . to_vector . should_equal ["a", "foo", "", Nothing, "👩🔬👩🔬V", "caf", "It "] # Grapheme Length
True -> resLeft . to_vector . should_equal ["a", "foo", "", Nothing, "👩‍🔬", "caf", "It "] # Storage Length
case setup.is_database of
False -> resRight . to_vector . should_equal ["a", "bar", "", Nothing, "V👩🔬👩🔬", "afé", "ir."] # Grapheme Length
True -> resRight . to_vector . should_equal ["a", "bar", "", Nothing, "👩‍🔬", "afé", "ir."] # Storage Length
# Calling text_left / text_right on a numeric column must be rejected.
Test.specify "text_left and text_right should error on non-string columns" <|
    numeric_table = table_builder [["numbers", [1, 2, 3]]]
    numeric_column = numeric_table.at "numbers"
    numeric_column.text_left 6 . should_fail_with Invalid_Value_Type
    numeric_column.text_right 6 . should_fail_with Invalid_Value_Type
# Non-integer counts (a float, a text) must be rejected by both operations.
# NOTE(review): the column used here is numeric, so the failure may already
# come from the text check on the column rather than from the integer check
# on the argument - consider switching to a text column once confirmed that
# the error type stays Invalid_Value_Type.
Test.specify "text_left and text_right should error on non integer parameters" <|
    t = table_builder [["numbers", [1, 2, 3]]]
    col = t.at "numbers"
    col.text_left 3.14 . should_fail_with Invalid_Value_Type
    col.text_right 3.14 . should_fail_with Invalid_Value_Type
    col.text_left "7" . should_fail_with Invalid_Value_Type
    # Previously this line repeated the text_left check, leaving text_right
    # with a text argument untested.
    col.text_right "7" . should_fail_with Invalid_Value_Type
Test.specify "text_left and text_right should return empty on zero argument" <|
with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩🔬👩🔬V👩🔬👩🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
col = t.at "strings"
resLeft = col.text_left 0
resLeft.name . should_equal "text_left([strings], 0)"
resRight = col.text_right 0
resRight.name . should_equal "text_right([strings], 0)"
resLeft . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
resRight . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
Test.specify "text_left and text_right should return empty on negative arguments" <|
with_mixed_columns_if_supported [["strings", ["a", "foobar", "", Nothing, "👩🔬👩🔬V👩🔬👩🔬", "café", "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair."]]] t->
col = t.at "strings"
resLeft = col.text_left -3
resLeft.name . should_equal "text_left([strings], -3)"
resRight = col.text_right -3
resRight.name . should_equal "text_right([strings], -3)"
resLeft . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
resRight . to_vector . should_equal ["", "", "", Nothing, "", "", ""]
Test.specify "should handle operations like is_empty, is_blank, fill_empty" <|
with_mixed_columns_if_supported [["s", ["", " ", " ", Nothing, "foo"]], ["letters", ["a", "b", "c", "d", "e"]]] t->

View File

@ -70,6 +70,8 @@ spec =
Text_Utils.take_prefix txt 5 . should_equal txt
Text_Utils.take_prefix txt 400 . should_equal txt
Text_Utils.take_prefix txt 0 . should_equal ''
Text_Utils.take_prefix txt -1 . should_equal ''
Text_Utils.take_prefix txt -42 . should_equal ''
Text_Utils.take_suffix txt 1 . should_equal 'c\u0301'
Text_Utils.take_suffix txt 2 . should_equal 'śc\u0301'
@ -78,6 +80,8 @@ spec =
Text_Utils.take_suffix txt 5 . should_equal txt
Text_Utils.take_suffix txt 400 . should_equal txt
Text_Utils.take_suffix txt 0 . should_equal ''
Text_Utils.take_suffix txt -1 . should_equal ''
Text_Utils.take_suffix txt -42 . should_equal ''
Text_Utils.take_prefix '🚀🚧' 1 . should_equal '🚀'
Text_Utils.take_prefix '🚀🚧' 2 . should_equal '🚀🚧'