mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 11:52:59 +03:00
Use memoized in built untrimmed count.
This commit is contained in:
parent
d8330c9bc5
commit
57b255ebc3
@ -40,6 +40,7 @@ from project.Internal.Storage import enso_to_java, java_to_enso
|
||||
polyglot java import org.enso.base.Time_Utils
|
||||
polyglot java import org.enso.table.data.column.operation.cast.CastProblemAggregator
|
||||
polyglot java import org.enso.table.data.column.operation.CountNothing
|
||||
polyglot java import org.enso.table.data.column.operation.CountUntrimmed
|
||||
polyglot java import org.enso.table.data.column.operation.unary.DatePartOperation
|
||||
polyglot java import org.enso.table.data.column.operation.unary.IsEmptyOperation
|
||||
polyglot java import org.enso.table.data.column.operation.unary.IsFiniteOperation
|
||||
@ -2213,6 +2214,14 @@ type Column
|
||||
count_nothing : Integer
|
||||
count_nothing self = CountNothing.apply self.java_column
|
||||
|
||||
## PRIVATE
   Counts the text values that have leading or trailing whitespace.

   Returns `Nothing` when the column's value type is neither `Mixed` nor a
   text type. Used for the data quality indicator in Table Viz.
count_untrimmed : Integer | Nothing
count_untrimmed self =
    may_hold_text = self.value_type == Value_Type.Mixed || self.value_type.is_text
    if may_hold_text then CountUntrimmed.apply self.java_column else Nothing
|
||||
|
||||
## GROUP Standard.Base.Metadata
|
||||
ICON metadata
|
||||
Returns the number of non-null items in this column.
|
||||
|
@ -32,14 +32,13 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
|
||||
_ : Row -> make_json_for_dictionary x.to_dictionary max_rows "column"
|
||||
_ : Column -> prepare_visualization x.to_table max_rows
|
||||
_ : Table ->
|
||||
dataframe = x.take max_rows
|
||||
all_rows_count = x.row_count
|
||||
make_json_for_table dataframe all_rows_count True False
|
||||
make_json_for_table x max_rows all_rows_count True False
|
||||
_ : DB_Column -> prepare_visualization x.to_table max_rows
|
||||
_ : DB_Table ->
|
||||
dataframe = x.read (..First max_rows)
|
||||
all_rows_count = x.row_count
|
||||
make_json_for_table dataframe all_rows_count True True
|
||||
make_json_for_table dataframe max_rows all_rows_count True True
|
||||
_ : Function ->
|
||||
pairs = [['_display_text_', '[Function '+x.to_text+']']]
|
||||
value = JS_Object.from_pairs pairs
|
||||
@ -59,14 +58,6 @@ prepare_visualization y max_rows=1000 = if y.is_error then (make_json_for_error
|
||||
Column Limit
|
||||
max_columns = 250
|
||||
|
||||
## PRIVATE
|
||||
whitespace_count : Column -> Integer | Nothing
|
||||
whitespace_count col =
|
||||
find_whitespace col =
|
||||
filtered = col.to_vector.filter (c-> c.is_a Text && c.is_empty.not && (c.first.is_whitespace || c.last.is_whitespace))
|
||||
filtered.length
|
||||
if (col.value_type == Value_Type.Mixed || col.value_type.is_text) then find_whitespace col else Nothing
|
||||
|
||||
## PRIVATE
|
||||
Render Error to JSON
|
||||
make_json_for_error : Any -> JS_Object
|
||||
@ -187,9 +178,10 @@ make_json_for_xml_element xml_element max_items type:Text="XML_Element" =
|
||||
to display.
|
||||
- all_rows_count: the number of all rows in the underlying data, useful if
|
||||
only a fragment is displayed.
|
||||
make_json_for_table : Table -> Integer -> Boolean -> Boolean -> JS_Object
|
||||
make_json_for_table dataframe all_rows_count include_index_col is_db_table =
|
||||
get_vector c = Warning.set (c.to_vector.map v-> make_json_for_value v) []
|
||||
make_json_for_table : Table -> Integer -> Integer -> Boolean -> Boolean -> JS_Object
|
||||
make_json_for_table dataframe max_rows all_rows_count include_index_col is_db_table =
|
||||
act_max = if max_rows < all_rows_count then max_rows else all_rows_count
|
||||
get_vector c = Warning.set (Vector.new act_max i-> make_json_for_value (c.get i)) []
|
||||
columns = dataframe.columns
|
||||
header = ["header", columns.map .name]
|
||||
value_type = ["value_type", columns.map .value_type]
|
||||
@ -197,8 +189,8 @@ make_json_for_table dataframe all_rows_count include_index_col is_db_table =
|
||||
all_rows = ["all_rows_count", all_rows_count]
|
||||
has_index_col = ["has_index_col", include_index_col]
|
||||
links = ["get_child_node_action", "get_row"]
|
||||
number_of_nothing = if is_db_table then Nothing else columns.map c-> c.count_nothing
|
||||
number_of_whitespace= if is_db_table then Nothing else columns.map c-> whitespace_count c
|
||||
number_of_nothing = if is_db_table then Nothing else columns.map .count_nothing
|
||||
number_of_whitespace= if is_db_table then Nothing else columns.map .count_untrimmed
|
||||
data_quality_pairs = JS_Object.from_pairs [["number_of_nothing", number_of_nothing], ["number_of_whitespace", number_of_whitespace]]
|
||||
pairs = [header, value_type, data, all_rows, has_index_col, links, ["data_quality_pairs", data_quality_pairs] ,["type", "Table"]]
|
||||
JS_Object.from_pairs pairs
|
||||
|
@ -35,6 +35,64 @@ public class Text_Utils {
|
||||
return string.substring(from, to);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the first Grapheme cluster in the string.
|
||||
*
|
||||
* @param string the string to substring
|
||||
* @return the first grapheme cluster in the string or null if the string is empty.
|
||||
*/
|
||||
public static String first_cluster(String string) {
|
||||
BreakIterator breakIterator = BreakIterator.getCharacterInstance();
|
||||
breakIterator.setText(string);
|
||||
int start = breakIterator.first();
|
||||
if (start == -1) {
|
||||
return null;
|
||||
}
|
||||
int end = breakIterator.next();
|
||||
return string.substring(start, end);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last Grapheme cluster in the string.
|
||||
*
|
||||
* @param string the string to substring
|
||||
* @return the last grapheme cluster in the string or null if the string is empty.
|
||||
*/
|
||||
public static String last_cluster(String string) {
|
||||
BreakIterator breakIterator = BreakIterator.getCharacterInstance();
|
||||
breakIterator.setText(string);
|
||||
int start = breakIterator.last();
|
||||
if (start == -1) {
|
||||
return null;
|
||||
}
|
||||
int end = breakIterator.previous();
|
||||
return string.substring(end, start);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the string has leading or trailing whitespace.
|
||||
*
|
||||
* @param s the string to check
|
||||
* @return whether the string has leading or trailing whitespace
|
||||
*/
|
||||
public static boolean has_leading_trailing_whitespace(String s) {
|
||||
if (s == null && s.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var leading = Text_Utils.first_cluster(s);
|
||||
if (leading != null && is_all_whitespace(leading)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var trailing = Text_Utils.last_cluster(s);
|
||||
if (trailing != null && is_all_whitespace(trailing)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new string containing characters starting at the given UTF-16 index.
|
||||
*
|
||||
|
@ -0,0 +1,39 @@
|
||||
package org.enso.table.data.column.operation;
|
||||
|
||||
import org.enso.base.Text_Utils;
|
||||
import org.enso.table.data.column.storage.ColumnStorage;
|
||||
import org.enso.table.data.column.storage.StringStorage;
|
||||
import org.enso.table.data.table.Column;
|
||||
import org.graalvm.polyglot.Context;
|
||||
|
||||
public class CountUntrimmed {
|
||||
/** Counts the number of cells in the columns with leading or trailing whitespace. */
|
||||
public static long apply(Column column) {
|
||||
ColumnStorage storage = column.getStorage();
|
||||
return applyToStorage(storage);
|
||||
}
|
||||
|
||||
/** Counts the number of cells in the given storage with leading or trailing whitespace. */
|
||||
public static long applyToStorage(ColumnStorage storage) {
|
||||
if (storage instanceof StringStorage stringStorage) {
|
||||
return stringStorage.countLeadingTrailingWhitespace();
|
||||
}
|
||||
return compute(storage);
|
||||
}
|
||||
|
||||
/** Internal method performing the calculation on a storage. */
|
||||
public static long compute(ColumnStorage storage) {
|
||||
Context context = Context.getCurrent();
|
||||
long count = 0;
|
||||
for (long i = 0; i < storage.getSize(); i++) {
|
||||
var val = storage.getItemAsObject(i);
|
||||
if (val instanceof String str) {
|
||||
if (Text_Utils.has_leading_trailing_whitespace(str)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
context.safepoint();
|
||||
}
|
||||
return count;
|
||||
}
|
||||
}
|
@ -3,6 +3,7 @@ package org.enso.table.data.column.storage;
|
||||
import java.util.BitSet;
|
||||
import org.enso.base.CompareException;
|
||||
import org.enso.base.Text_Utils;
|
||||
import org.enso.table.data.column.operation.CountUntrimmed;
|
||||
import org.enso.table.data.column.operation.map.BinaryMapOperation;
|
||||
import org.enso.table.data.column.operation.map.MapOperationProblemAggregator;
|
||||
import org.enso.table.data.column.operation.map.MapOperationStorage;
|
||||
@ -20,6 +21,7 @@ import org.graalvm.polyglot.Context;
|
||||
public final class StringStorage extends SpecializedStorage<String> {
|
||||
|
||||
private final TextType type;
|
||||
private long _countLeadingTrailingWhitespace = -1;
|
||||
|
||||
/**
|
||||
* @param data the underlying data
|
||||
@ -46,6 +48,20 @@ public final class StringStorage extends SpecializedStorage<String> {
|
||||
return type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts the number of cells in the columns with whitespace.
|
||||
* Memoized into the storage for performance.
|
||||
* @return the number of cells with whitespace
|
||||
*/
|
||||
public Long countLeadingTrailingWhitespace() {
|
||||
if (_countLeadingTrailingWhitespace >= 0) {
|
||||
return _countLeadingTrailingWhitespace;
|
||||
}
|
||||
|
||||
_countLeadingTrailingWhitespace = CountUntrimmed.compute(this);
|
||||
return _countLeadingTrailingWhitespace;
|
||||
}
|
||||
|
||||
private static MapOperationStorage<String, SpecializedStorage<String>> buildOps() {
|
||||
MapOperationStorage<String, SpecializedStorage<String>> t = ObjectStorage.buildObjectOps();
|
||||
t.add(
|
||||
|
Loading…
Reference in New Issue
Block a user