Use memoized built-in untrimmed count.

2024-11-22 11:52:59 +03:00 · 2024-11-20 16:31:46 +00:00 · 2024-11-20 16:31:46 +00:00 · 57b255ebc3
commit 57b255ebc3
parent d8330c9bc5
5 changed files with 130 additions and 16 deletions
--- a/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso
+++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Column.enso
@ -40,6 +40,7 @@ from project.Internal.Storage import enso_to_java, java_to_enso
 polyglot java import org.enso.base.Time_Utils
 polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator
 polyglot java import org.enso.table.data.column.operation.CountNothing
+polyglot java import org.enso.table.data.column.operation.CountUntrimmed
 polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation
 polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation
 polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation
@ -2213,6 +2214,14 @@ type Column
    count_nothing : Integer
    count_nothing self = CountNothing.apply self.java_column

+    ## PRIVATE
+       Counts text values with leading or trailing whitespace (`Nothing` unless the
+       column is text or `Mixed`). Used for data quality indicator in Table Viz.
+    count_untrimmed : Integer | Nothing
+    count_untrimmed self =
+        holds_text = self.value_type == Value_Type.Mixed || self.value_type.is_text
+        if holds_text then CountUntrimmed.apply self.java_column else Nothing
+
    ## GROUP Standard.Base.Metadata
       ICON metadata
       Returns the number of non-null items in this column.
--- a/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso
+++ b/distribution/lib/Standard/Visualization/0.0.0-dev/src/Table/Visualization.enso
@ -32,14 +32,13 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
        _ : Row -> make_json_for_dictionary x.to_dictionary max_rows "column"
        _ : Column -> prepare_visualization x.to_table max_rows
        _ : Table ->
-            dataframe = x.take max_rows
            all_rows_count = x.row_count
-            make_json_for_table dataframe all_rows_count True False
+            make_json_for_table x max_rows all_rows_count True False
        _ : DB_Column -> prepare_visualization x.to_table max_rows
        _ : DB_Table ->
            dataframe = x.read (..First max_rows)
            all_rows_count = x.row_count
-            make_json_for_table dataframe all_rows_count True True
+            make_json_for_table dataframe max_rows all_rows_count True True
        _ : Function ->
            pairs = [['_display_text_', '[Function '+x.to_text+']']]
            value = JS_Object.from_pairs pairs
@ -59,14 +58,6 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
   Column Limit
 max_columns = 250

-## PRIVATE
-whitespace_count : Column -> Integer | Nothing
-whitespace_count col =
-    find_whitespace col = 
-        filtered = col.to_vector.filter (c-> c.is_a Text && c.is_empty.not && (c.first.is_whitespace || c.last.is_whitespace))
-        filtered.length
-    if (col.value_type == Value_Type.Mixed || col.value_type.is_text) then find_whitespace col else Nothing 
-
 ## PRIVATE
   Render Error to JSON
 make_json_for_error : Any -> JS_Object
@ -187,9 +178,10 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" =
     to display.
   - all_rows_count: the number of all rows in the underlying data, useful if
     only a fragment is displayed.
-make_json_for_table : Table -> Integer -> Boolean -> Boolean -> JS_Object
-make_json_for_table dataframe all_rows_count include_index_col is_db_table =
-    get_vector c = Warning.set (c.to_vector.map v-> make_json_for_value v) []
+make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object
+make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table =
+    act_max = if max_rows < all_rows_count then max_rows else all_rows_count
+    get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) []
    columns     = dataframe.columns
    header      = ["header", columns.map .name]
    value_type  = ["value_type", columns.map .value_type]
@ -197,8 +189,8 @@ make_json_for_table dataframe all_rows_count include_index_col is_db_table =
    all_rows    = ["all_rows_count", all_rows_count]
    has_index_col = ["has_index_col", include_index_col]
    links       = ["get_child_node_action", "get_row"]
-    number_of_nothing = if is_db_table then Nothing else columns.map c-> c.count_nothing
-    number_of_whitespace= if is_db_table then Nothing else columns.map c-> whitespace_count c
+    number_of_nothing = if is_db_table then Nothing else columns.map .count_nothing
+    number_of_whitespace= if is_db_table then Nothing else columns.map .count_untrimmed
    data_quality_pairs = JS_Object.from_pairs [["number_of_nothing", number_of_nothing], ["number_of_whitespace", number_of_whitespace]]
    pairs       = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_pairs", data_quality_pairs] ,["type", "Table"]]
    JS_Object.from_pairs pairs
--- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@ -35,6 +35,64 @@ public class Text_Utils {
    return string.substring(from, to);
  }

+  /**
+   * Gets the first grapheme cluster in the string.
+   *
+   * @param string the string to take the cluster from
+   * @return the first grapheme cluster in the string or null if the string is empty.
+   */
+  public static String first_cluster(String string) {
+    BreakIterator breakIterator = BreakIterator.getCharacterInstance();
+    breakIterator.setText(string);
+    int start = breakIterator.first();
+    int end = breakIterator.next();
+    // first() always yields the start of the text, so an empty string can only be
+    // detected by next() reporting DONE; substring would throw on that value.
+    boolean isEmpty = end == BreakIterator.DONE;
+    return isEmpty ? null : string.substring(start, end);
+  }
+
+  /**
+   * Gets the last grapheme cluster in the string.
+   *
+   * @param string the string to take the cluster from
+   * @return the last grapheme cluster in the string or null if the string is empty.
+   */
+  public static String last_cluster(String string) {
+    BreakIterator breakIterator = BreakIterator.getCharacterInstance();
+    breakIterator.setText(string);
+    int end = breakIterator.last();
+    int start = breakIterator.previous();
+    // last() always yields the end of the text, so an empty string can only be
+    // detected by previous() reporting DONE; substring would throw on that value.
+    boolean isEmpty = start == BreakIterator.DONE;
+    return isEmpty ? null : string.substring(start, end);
+  }
+
+  /**
+   * Checks if the string has leading or trailing whitespace, i.e. whether its first or last
+   * grapheme cluster passes {@code is_all_whitespace}.
+   *
+   * @param s the string to check, may be null
+   * @return whether the string has leading or trailing whitespace; false for a null or empty
+   *     string
+   */
+  public static boolean has_leading_trailing_whitespace(String s) {
+    // Null and empty strings have no cluster to inspect, so either condition alone must
+    // answer early (hence `||`); this also keeps null away from the dereference below.
+    if (s == null || s.isEmpty()) {
+      return false;
+    }
+
+    var leading = Text_Utils.first_cluster(s);
+    if (leading != null && is_all_whitespace(leading)) {
+      return true;
+    }
+
+    var trailing = Text_Utils.last_cluster(s);
+    return trailing != null && is_all_whitespace(trailing);
+  }
+
  /**
   * Returns a new string containing characters starting at the given UTF-16 index.
   *
--- a/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/operation/CountUntrimmed.java
@ -0,0 +1,39 @@
+package org.enso.table.data.column.operation;
+
+import org.enso.base.Text_Utils;
+import org.enso.table.data.column.storage.ColumnStorage;
+import org.enso.table.data.column.storage.StringStorage;
+import org.enso.table.data.table.Column;
+import org.graalvm.polyglot.Context;
+
+public class CountUntrimmed {
+  /** Returns how many cells of the column's storage have leading or trailing whitespace. */
+  public static long apply(Column column) {
+    return applyToStorage(column.getStorage());
+  }
+
+  /**
+   * Same count for a storage; a {@link StringStorage} answers through its own counting method.
+   */
+  public static long applyToStorage(ColumnStorage storage) {
+    return storage instanceof StringStorage strings
+        ? strings.countLeadingTrailingWhitespace()
+        : compute(storage);
+  }
+
+  /** Internal method: scans every item and counts the untrimmed {@code String} values. */
+  public static long compute(ColumnStorage storage) {
+    Context polyglotContext = Context.getCurrent();
+    long untrimmed = 0;
+    long index = 0;
+    while (index < storage.getSize()) {
+      Object item = storage.getItemAsObject(index);
+      if (item instanceof String text && Text_Utils.has_leading_trailing_whitespace(text)) {
+        untrimmed++;
+      }
+      polyglotContext.safepoint();
+      index++;
+    }
+    return untrimmed;
+  }
+}
--- a/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java
+++ b/std-bits/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java
@ -3,6 +3,7 @@ package org.enso.table.data.column.storage;
 import java.util.BitSet;
 import org.enso.base.CompareException;
 import org.enso.base.Text_Utils;
+import org.enso.table.data.column.operation.CountUntrimmed;
 import org.enso.table.data.column.operation.map.BinaryMapOperation;
 import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
 import org.enso.table.data.column.operation.map.MapOperationStorage;
@ -20,6 +21,7 @@ import org.graalvm.polyglot.Context;
 public final class StringStorage extends SpecializedStorage<String> {

  private final TextType type;
+  private long _countLeadingTrailingWhitespace = -1;

  /**
   * @param data the underlying data
@ -46,6 +48,20 @@ public final class StringStorage extends SpecializedStorage<String> {
    return type;
  }

+  /**
+   * Returns how many cells of this storage have leading or trailing whitespace.
+   * The count is computed on first use and then kept in the storage for later calls.
+   * @return the number of cells with leading or trailing whitespace
+   */
+  public Long countLeadingTrailingWhitespace() {
+    // A negative value marks the count as not yet computed.
+    if (_countLeadingTrailingWhitespace < 0) {
+      _countLeadingTrailingWhitespace = CountUntrimmed.compute(this);
+    }
+
+    return _countLeadingTrailingWhitespace;
+  }
+
  private static MapOperationStorage<String, SpecializedStorage<String>> buildOps() {
    MapOperationStorage<String, SpecializedStorage<String>> t = ObjectStorage.buildObjectOps();
    t.add(