Use a memoized, built-in untrimmed count.

This commit is contained in:
James Dunkerley 2024-11-20 16:31:46 +00:00
parent d8330c9bc5
commit 57b255ebc3
5 changed files with 130 additions and 16 deletions

View File

@ -40,6 +40,7 @@ from project.Internal.Storage import enso_to_java, java_to_enso
polyglot java import org.enso.base.Time_Utils
polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator
polyglot java import org.enso.table.data.column.operation.CountNothing
polyglot java import org.enso.table.data.column.operation.CountUntrimmed
polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation
polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation
polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation
@ -2213,6 +2214,14 @@ type Column
count_nothing : Integer
count_nothing self = CountNothing.apply self.java_column
## PRIVATE
Counts the number of text values with leading or trailing whitespace.
Returns `Nothing` when the column's value type is neither `Mixed` nor a
text type.
Used for data quality indicator in Table Viz.
count_untrimmed : Integer | Nothing
count_untrimmed self =
if (self.value_type == Value_Type.Mixed || self.value_type.is_text).not then Nothing else
CountUntrimmed.apply self.java_column
## GROUP Standard.Base.Metadata
ICON metadata
Returns the number of non-null items in this column.

View File

@ -32,14 +32,13 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
_ : Row -> make_json_for_dictionary x.to_dictionary max_rows "column"
_ : Column -> prepare_visualization x.to_table max_rows
_ : Table ->
dataframe = x.take max_rows
all_rows_count = x.row_count
make_json_for_table dataframe all_rows_count True False
make_json_for_table x max_rows all_rows_count True False
_ : DB_Column -> prepare_visualization x.to_table max_rows
_ : DB_Table ->
dataframe = x.read (..First max_rows)
all_rows_count = x.row_count
make_json_for_table dataframe all_rows_count True True
make_json_for_table dataframe max_rows all_rows_count True True
_ : Function ->
pairs = [['_display_text_', '[Function '+x.to_text+']']]
value = JS_Object.from_pairs pairs
@ -59,14 +58,6 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
Column Limit
max_columns = 250
## PRIVATE
Counts the values of the column that are `Text`, are not empty, and whose
first or last character is whitespace.
Returns `Nothing` unless the column's value type is `Mixed` or a text type.
whitespace_count : Column -> Integer | Nothing
whitespace_count col =
find_whitespace col =
filtered = col.to_vector.filter (c-> c.is_a Text && c.is_empty.not && (c.first.is_whitespace || c.last.is_whitespace))
filtered.length
if (col.value_type == Value_Type.Mixed || col.value_type.is_text) then find_whitespace col else Nothing
## PRIVATE
Render Error to JSON
make_json_for_error : Any -> JS_Object
@ -187,9 +178,10 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" =
to display.
- all_rows_count: the number of all rows in the underlying data, useful if
only a fragment is displayed.
make_json_for_table : Table -> Integer -> Boolean -> Boolean -> JS_Object
make_json_for_table dataframe all_rows_count include_index_col is_db_table =
get_vector c = Warning.set (c.to_vector.map v-> make_json_for_value v) []
make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object
make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table =
act_max = if max_rows < all_rows_count then max_rows else all_rows_count
get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) []
columns = dataframe.columns
header = ["header", columns.map .name]
value_type = ["value_type", columns.map .value_type]
@ -197,8 +189,8 @@ make_json_for_table dataframe all_rows_count include_index_col is_db_table =
all_rows = ["all_rows_count", all_rows_count]
has_index_col = ["has_index_col", include_index_col]
links = ["get_child_node_action", "get_row"]
number_of_nothing = if is_db_table then Nothing else columns.map c-> c.count_nothing
number_of_whitespace= if is_db_table then Nothing else columns.map c-> whitespace_count c
number_of_nothing = if is_db_table then Nothing else columns.map .count_nothing
number_of_whitespace= if is_db_table then Nothing else columns.map .count_untrimmed
data_quality_pairs = JS_Object.from_pairs [["number_of_nothing", number_of_nothing], ["number_of_whitespace", number_of_whitespace]]
pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_pairs", data_quality_pairs] ,["type", "Table"]]
JS_Object.from_pairs pairs

View File

@ -35,6 +35,64 @@ public class Text_Utils {
return string.substring(from, to);
}
/**
 * Gets the first grapheme cluster in the string.
 *
 * @param string the string to take the first grapheme cluster from
 * @return the first grapheme cluster in the string or null if the string is empty.
 */
public static String first_cluster(String string) {
  BreakIterator breakIterator = BreakIterator.getCharacterInstance();
  breakIterator.setText(string);
  int start = breakIterator.first();
  // `first()` always reports the start of the text, so emptiness cannot be detected from it.
  // An empty string is recognised by `next()` finding no boundary after the start.
  int end = breakIterator.next();
  if (end == BreakIterator.DONE) {
    return null;
  }
  return string.substring(start, end);
}
/**
 * Gets the last grapheme cluster in the string.
 *
 * @param string the string to take the last grapheme cluster from
 * @return the last grapheme cluster in the string or null if the string is empty.
 */
public static String last_cluster(String string) {
  BreakIterator breakIterator = BreakIterator.getCharacterInstance();
  breakIterator.setText(string);
  int end = breakIterator.last();
  // `last()` always reports the end of the text, so emptiness cannot be detected from it.
  // An empty string is recognised by `previous()` finding no boundary before the end.
  int start = breakIterator.previous();
  if (start == BreakIterator.DONE) {
    return null;
  }
  return string.substring(start, end);
}
/**
 * Checks if the string has leading or trailing whitespace.
 *
 * <p>Only the first and the last grapheme cluster of the string are inspected.
 *
 * @param s the string to check
 * @return whether the string has leading or trailing whitespace; {@code false} for a null or
 *     empty string
 */
public static boolean has_leading_trailing_whitespace(String s) {
  // Either condition alone means there is nothing to inspect. Using `||` also guarantees that
  // `isEmpty()` is never invoked on a null reference.
  if (s == null || s.isEmpty()) {
    return false;
  }

  var leading = Text_Utils.first_cluster(s);
  if (leading != null && is_all_whitespace(leading)) {
    return true;
  }

  var trailing = Text_Utils.last_cluster(s);
  return trailing != null && is_all_whitespace(trailing);
}
/**
* Returns a new string containing characters starting at the given UTF-16 index.
*

View File

@ -0,0 +1,39 @@
package org.enso.table.data.column.operation;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.storage.ColumnStorage;
import org.enso.table.data.column.storage.StringStorage;
import org.enso.table.data.table.Column;
import org.graalvm.polyglot.Context;
/** Operation counting the text cells that carry leading or trailing whitespace. */
public class CountUntrimmed {
  /**
   * Counts the cells of the given column that have leading or trailing whitespace.
   *
   * @param column the column whose storage is inspected
   * @return the number of matching cells
   */
  public static long apply(Column column) {
    return applyToStorage(column.getStorage());
  }

  /**
   * Counts the cells of the given storage that have leading or trailing whitespace.
   *
   * <p>A {@link StringStorage} is asked for its own count; any other storage is scanned directly.
   *
   * @param storage the storage to inspect
   * @return the number of matching cells
   */
  public static long applyToStorage(ColumnStorage storage) {
    if (!(storage instanceof StringStorage textStorage)) {
      return compute(storage);
    }
    return textStorage.countLeadingTrailingWhitespace();
  }

  /**
   * Scans every item of the storage and counts the {@code String} values that have leading or
   * trailing whitespace. Items that are not strings are ignored.
   *
   * @param storage the storage to scan
   * @return the number of matching cells
   */
  public static long compute(ColumnStorage storage) {
    Context context = Context.getCurrent();
    long untrimmed = 0;
    long index = 0;
    while (index < storage.getSize()) {
      if (storage.getItemAsObject(index) instanceof String text
          && Text_Utils.has_leading_trailing_whitespace(text)) {
        untrimmed++;
      }
      // Poll the polyglot context once per item, whether or not the item matched.
      context.safepoint();
      index++;
    }
    return untrimmed;
  }
}

View File

@ -3,6 +3,7 @@ package org.enso.table.data.column.storage;
import java.util.BitSet;
import org.enso.base.CompareException;
import org.enso.base.Text_Utils;
import org.enso.table.data.column.operation.CountUntrimmed;
import org.enso.table.data.column.operation.map.BinaryMapOperation;
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
import org.enso.table.data.column.operation.map.MapOperationStorage;
@ -20,6 +21,7 @@ import org.graalvm.polyglot.Context;
public final class StringStorage extends SpecializedStorage<String> {
private final TextType type;
private long _countLeadingTrailingWhitespace = -1;
/**
* @param data the underlying data
@ -46,6 +48,20 @@ public final class StringStorage extends SpecializedStorage<String> {
return type;
}
/**
 * Returns how many cells of this storage have leading or trailing whitespace.
 *
 * <p>The value is obtained from {@code CountUntrimmed.compute} on the first request and kept in
 * the storage afterwards; a negative cached value means it has not been computed yet.
 *
 * @return the number of cells with leading or trailing whitespace
 */
public Long countLeadingTrailingWhitespace() {
  if (_countLeadingTrailingWhitespace < 0) {
    _countLeadingTrailingWhitespace = CountUntrimmed.compute(this);
  }
  return _countLeadingTrailingWhitespace;
}
private static MapOperationStorage<String, SpecializedStorage<String>> buildOps() {
MapOperationStorage<String, SpecializedStorage<String>> t = ObjectStorage.buildObjectOps();
t.add(